diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -134,7 +134,8 @@
   /// Tries to vectorize constructs started from CmpInst, InsertValueInst or
   /// InsertElementInst instructions.
   bool vectorizeSimpleInstructions(SmallVectorImpl<Instruction *> &Instructions,
-                                   BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+                                   BasicBlock *BB, slpvectorizer::BoUpSLP &R,
+                                   bool AtTerminator);
 
   /// Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -174,6 +174,20 @@
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
+// FIXME: These 2 options are required to avoid regressions in O3+LTO because of
+// too early optimizations at compile time.
+static cl::opt<unsigned>
+    MinNonPow2StoresSize("slp-min-non-power2-stores-size", cl::init(6),
+                         cl::Hidden,
+                         cl::desc("The minimum number of non-power-2 stores to "
+                                  "vectorize to try to use masked stores."));
+
+static cl::opt<unsigned>
+    MinNonPow2ValuesSize("slp-min-non-power2-values-size", cl::init(4),
+                         cl::Hidden,
+                         cl::desc("The minimum number of non-power-2 non-store "
+                                  "values to try the vectorization."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -201,20 +215,14 @@
 
 /// \returns true if all of the instructions in \p VL are in the same block or
 /// false otherwise.
-static bool allSameBlock(ArrayRef<Value *> VL) {
-  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
-  if (!I0)
-    return false;
+template <typename T> static bool allSameBlock(T &&VL) {
+  if (empty(VL))
+    return true;
+  auto *I0 = cast<Instruction>(*VL.begin());
   BasicBlock *BB = I0->getParent();
-  for (int I = 1, E = VL.size(); I < E; I++) {
-    auto *II = dyn_cast<Instruction>(VL[I]);
-    if (!II)
-      return false;
-
-    if (BB != II->getParent())
-      return false;
-  }
-  return true;
+  return all_of(drop_begin(VL, 1), [BB](Value *V) {
+    return BB == cast<Instruction>(V)->getParent();
+  });
 }
 
 /// \returns True if all of the values in \p VL are constants (but not
@@ -228,11 +236,19 @@
   return true;
 }
 
-/// \returns True if all of the values in \p VL are identical.
+/// \returns True if all defined values in \p VL are identical.
 static bool isSplat(ArrayRef<Value *> VL) {
-  for (unsigned i = 1, e = VL.size(); i < e; ++i)
-    if (VL[i] != VL[0])
+  Value *VL0 = nullptr;
+  for (Value *V : VL) {
+    if (isa<UndefValue>(V))
+      continue;
+    if (!VL0) {
+      VL0 = V;
+      continue;
+    }
+    if (V != VL0)
       return false;
+  }
   return true;
 }
 
@@ -407,9 +423,25 @@
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        unsigned BaseIndex = 0) {
-  // Make sure these are all Instructions.
-  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+  // Make sure these are all Instructions or UndefValues.
+  auto &&IsNotInstructionOrAllUndefs = [](ArrayRef<Value *> VL) {
+    bool AllUndefs = true;
+    for (Value *V : VL) {
+      if (!isa<UndefValue>(V)) {
+        if (isa<Instruction>(V)) {
+          AllUndefs = false;
+          continue;
+        }
+        return true;
+      }
+    }
+    return AllUndefs;
+  };
+  if (IsNotInstructionOrAllUndefs(VL))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+  BaseIndex =
+      std::distance(VL.begin(), llvm::find_if(llvm::drop_begin(VL, BaseIndex),
+                                              Instruction::classof));
 
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
@@ -420,6 +452,8 @@
   // Check for one alternate opcode from another BinaryOperator.
   // TODO - generalize to support all operators (types, calls etc.).
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+    if (isa<UndefValue>(VL[Cnt]))
+      continue;
     unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
     if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
       if (InstOpcode == Opcode || InstOpcode == AltOpcode)
@@ -537,9 +571,10 @@
                                SmallVectorImpl<int> &Mask) {
   Mask.clear();
   const unsigned E = Indices.size();
-  Mask.resize(E, E + 1);
+  Mask.resize(E, UndefMaskElem);
   for (unsigned I = 0; I < E; ++I)
-    Mask[Indices[I]] = I;
+    if (Indices[I] < E)
+      Mask[Indices[I]] = I;
 }
 
 namespace slpvectorizer {
@@ -616,7 +651,10 @@
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
+    EntryVFs.clear();
     MustGather.clear();
+    GatheredLoads.clear();
+    GatheredLoadsEntriesFirst = -1;
     ExternalUses.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
@@ -625,6 +663,7 @@
       BS->clear();
     }
     MinBWs.clear();
+    InstrElementSize.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -672,20 +711,24 @@
     // If the leaf has the same number of instructions to vectorize as the root
     // - order must be set already.
     unsigned RootSize = VectorizableTree[0]->Scalars.size();
-    if (Order.size() == RootSize)
+    // Checks if the order is normalized relatively the root node, i.e. it has
+    // the same number of undef elements (undef element is equal to RootSize
+    // value) as the root node scalars.
+    auto &&IsNormalizedOrder = [this, RootSize](const OrdersType &Order) {
+      return count(Order, RootSize) ==
+             count_if(VectorizableTree[0]->Scalars, UndefValue::classof);
+    };
+    // Check if the current order has the same number of undefined elements as
+    // the root node.
+    if (IsNormalizedOrder(Order))
       return;
-    SmallVector<unsigned, 4> RealOrder(Order.size());
-    std::swap(Order, RealOrder);
-    SmallVector<int, 4> Mask;
-    inversePermutation(RealOrder, Mask);
-    Order.assign(Mask.begin(), Mask.end());
     // The leaf has less number of instructions - need to find the true order of
     // the root.
     // Scan the nodes starting from the leaf back to the root.
     const TreeEntry *PNode = VectorizableTree.back().get();
     SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
     SmallPtrSet<const TreeEntry *, 4> Visited;
-    while (!Nodes.empty() && Order.size() != RootSize) {
+    while (!Nodes.empty() && !IsNormalizedOrder(Order)) {
       const TreeEntry *PNode = Nodes.pop_back_val();
       if (!Visited.insert(PNode).second)
         continue;
@@ -696,8 +739,10 @@
       if (Node.ReuseShuffleIndices.empty())
         continue;
       // Build the order for the parent node.
-      OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
-      SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
+      SmallVector<int, 4> Mask;
+      inversePermutation(Order, Mask);
+      Order.assign(RootSize, RootSize);
+      SmallVector<unsigned, 4> OrderCounter(RootSize + 1, 0);
       // The algorithm of the order extension is:
       // 1. Calculate the number of the same instructions for the order.
       // 2. Calculate the index of the new order: total number of instructions
@@ -706,28 +751,33 @@
       // 3. The new order is just the index of the instruction in the original
       // vector of the instructions.
       for (unsigned I : Node.ReuseShuffleIndices)
-        ++OrderCounter[Order[I]];
-      SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
+        if (I != RootSize && Mask[I] != UndefMaskElem)
+          ++OrderCounter[Mask[I]];
+      SmallVector<unsigned, 4> CurrentCounter(Order.size() + 1, 0);
       for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
         unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
-        unsigned OrderIdx = Order[ReusedIdx];
+        if (ReusedIdx == RootSize)
+          continue;
+        int OrderIdx = Mask[ReusedIdx];
+        if (OrderIdx == UndefMaskElem) {
+          // Special case where the UndefValue is actually a real operand. Need
+          // to expand the order taking this UndefValue into account.
+          OrderIdx = RootSize;
+        }
         unsigned NewIdx = 0;
-        for (unsigned J = 0; J < OrderIdx; ++J)
+        for (int J = 0; J < OrderIdx; ++J)
           NewIdx += OrderCounter[J];
         NewIdx += CurrentCounter[OrderIdx];
         ++CurrentCounter[OrderIdx];
-        assert(NewOrder[NewIdx] == RootSize &&
+        assert(Order[NewIdx] == RootSize &&
                "The order index should not be written already.");
-        NewOrder[NewIdx] = I;
+        Order[NewIdx] = I;
       }
-      std::swap(Order, NewOrder);
     }
-    assert(Order.size() == RootSize &&
-           "Root node is expected or the size of the order must be the same as "
-           "the number of elements in the root node.");
-    assert(llvm::all_of(Order,
-                        [RootSize](unsigned Val) { return Val != RootSize; }) &&
-           "All indices must be initialized");
+    // The order must be normalized relatively the root node  after the
+    // function.
+    assert(IsNormalizedOrder(Order) &&
+           "Indices for all non-undefs must be set.");
   }
 
   /// \return The vector element size in bits to use when vectorizing the
@@ -867,6 +917,7 @@
     /// accessing a consecutive address. These strategies are summarized in the
     /// 'ReorderingMode' enumerator.
     enum class ReorderingMode {
+      Unknown,  ///< Mode is not defined yet
       Load,     ///< Matching loads to consecutive memory addresses
       Opcode,   ///< Matching instructions based on opcode (same or alternate)
       Constant, ///< Matching constants
@@ -882,6 +933,12 @@
     const DataLayout &DL;
     ScalarEvolution &SE;
     const BoUpSLP &R;
+    /// Base instruction in the list of scalars, the first instruction with the
+    /// main opcode.
+    Instruction &VL0;
+    /// Number of lanes in the node, i.e. PowerOf2Ceil(number of instructions in
+    /// the node).
+    unsigned NumLanes = 0;
 
     /// \returns the operand data at \p OpIdx and \p Lane.
     OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -907,18 +964,25 @@
       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
     }
 
-    // The hard-coded scores listed here are not very important. When computing
-    // the scores of matching one sub-tree with another, we are basically
-    // counting the number of values that are matching. So even if all scores
-    // are set to 1, we would still get a decent matching result.
+    // The hard-coded scores listed here are not very important, though it shall
+    // be higher for better matches to iimprove the resulting cost. When
+    // computing the scores of matching one sub-tree with another, we are
+    // basically counting the number of values that are matching. So even if all
+    // scores are set to 1, we would still get a decent matching result.
     // However, sometimes we have to break ties. For example we may have to
     // choose between matching loads vs matching opcodes. This is what these
-    // scores are helping us with: they provide the order of preference.
+    // scores are helping us with: they provide the order of preference. Also,
+    // this is improtant if the scalar is externally used or used in another
+    // tree entry node in the different lane.
 
     /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
-    static const int ScoreConsecutiveLoads = 3;
+    static const int ScoreConsecutiveLoads = 4;
+    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
+    static const int ScoreReversedLoads = 3;
     /// ExtractElementInst from same vector and consecutive indexes.
-    static const int ScoreConsecutiveExtracts = 3;
+    static const int ScoreConsecutiveExtracts = 4;
+    /// ExtractElementInst from same vector and reversed indices.
+    static const int ScoreReversedExtracts = 3;
     /// Constants.
     static const int ScoreConstants = 2;
     /// Instructions with the same opcode.
@@ -938,7 +1002,10 @@
 
     /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
     static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
-                               ScalarEvolution &SE) {
+                               ScalarEvolution &SE, int NumLanes) {
+      if (V1 == V2)
+        return VLOperands::ScoreSplat;
+
       auto *LI1 = dyn_cast<LoadInst>(V1);
       auto *LI2 = dyn_cast<LoadInst>(V2);
       if (LI1 && LI2) {
@@ -948,29 +1015,45 @@
         Optional<int> Dist =
             getPointersDiff(LI1->getPointerOperand(), LI2->getPointerOperand(),
                             DL, SE, /*StrictCheck=*/true);
-        return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
-                                    : VLOperands::ScoreFail;
+        if (!Dist)
+          return VLOperands::ScoreFail;
+        // The distance is too large - still may be profitable to use masked
+        // loads/gathers.
+        if (std::abs(*Dist) > NumLanes / 2)
+          return VLOperands::ScoreAltOpcodes;
+        return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads
+                           : VLOperands::ScoreReversedLoads;
       }
 
       auto *C1 = dyn_cast<Constant>(V1);
       auto *C2 = dyn_cast<Constant>(V2);
-      if (C1 && C2)
+      if (C1 && C2 && !isa<UndefValue>(V2))
         return VLOperands::ScoreConstants;
 
       // Extracts from consecutive indexes of the same vector better score as
       // the extracts could be optimized away.
       Value *EV;
       ConstantInt *Ex1Idx, *Ex2Idx;
-      if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
-          match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
-          Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
-        return VLOperands::ScoreConsecutiveExtracts;
+      if (match(V2, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex2Idx)))) {
+        if (match(V1, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex1Idx)))) {
+          int Idx1 = Ex1Idx->getZExtValue();
+          int Idx2 = Ex2Idx->getZExtValue();
+          int Dist = Idx2 - Idx1;
+          // The distance is too large - still may be profitable to use
+          // shuffles.
+          if (std::abs(Dist) > NumLanes / 2)
+            return VLOperands::ScoreAltOpcodes;
+          return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts
+                            : VLOperands::ScoreReversedExtracts;
+        }
+        return VLOperands::ScoreFail;
+      }
 
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (I1 && I2) {
-        if (I1 == I2)
-          return VLOperands::ScoreSplat;
+        if (I1->getParent() != I2->getParent())
+          return VLOperands::ScoreFail;
         InstructionsState S = getSameOpcode({I1, I2});
         // Note: Only consider instructions with <= 2 operands to avoid
         // complexity explosion.
@@ -985,9 +1068,11 @@
       return VLOperands::ScoreFail;
     }
 
-    /// Holds the values and their lane that are taking part in the look-ahead
+    /// Holds the values and their lanes that are taking part in the look-ahead
     /// score calculation. This is used in the external uses cost calculation.
-    SmallDenseMap<Value *, int> InLookAheadValues;
+    /// Need to hold all the lanes in case of splat/broadcast at least to
+    /// correctly check for the use in the different lane.
+    SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues;
 
     /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
     /// either external to the vectorized code, or require shuffling.
@@ -1017,18 +1102,28 @@
             assert(It != UserTE->Scalars.end() && "U is in UserTE");
             int UserLn = std::distance(UserTE->Scalars.begin(), It);
             assert(UserLn >= 0 && "Bad lane");
-            if (UserLn != Ln)
+            // If the values are different, check just the line of the current
+            // value. If the values are the same, need to add UserInDiffLaneCost
+            // only if UserLn does not match both line numbers.
+            if ((LHS.first != RHS.first && UserLn != Ln) ||
+                (LHS.first == RHS.first && UserLn != LHS.second &&
+                 UserLn != RHS.second)) {
               Cost += UserInDiffLaneCost;
+              break;
+            }
           } else {
             // Check if the user is in the look-ahead code.
             auto It2 = InLookAheadValues.find(U);
             if (It2 != InLookAheadValues.end()) {
               // The user is in the look-ahead code. Check the lane.
-              if (It2->second != Ln)
+              if (!It2->getSecond().contains(Ln)) {
                 Cost += UserInDiffLaneCost;
+                break;
+              }
             } else {
               // The user is neither in SLP tree nor in the look-ahead code.
               Cost += ExternalUseCost;
+              break;
             }
           }
           // Limit the number of visited uses to cap compilation time.
@@ -1067,27 +1162,31 @@
       Value *V1 = LHS.first;
       Value *V2 = RHS.first;
       // Get the shallow score of V1 and V2.
-      int ShallowScoreAtThisLevel =
-          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
-                                       getExternalUsesCost(LHS, RHS));
+      int ShallowScoreAtThisLevel = std::max(
+          (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) -
+                              getExternalUsesCost(LHS, RHS));
       int Lane1 = LHS.second;
       int Lane2 = RHS.second;
 
       // If reached MaxLevel,
       //  or if V1 and V2 are not instructions,
       //  or if they are SPLAT,
-      //  or if they are not consecutive, early return the current cost.
+      //  or if they are not consecutive,
+      //  or if profitable to vectorize loads or extractelements, early return
+      //  the current cost.
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
           ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
-          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
+           ShallowScoreAtThisLevel))
         return ShallowScoreAtThisLevel;
       assert(I1 && I2 && "Should have early exited.");
 
       // Keep track of in-tree values for determining the external-use cost.
-      InLookAheadValues[V1] = Lane1;
-      InLookAheadValues[V2] = Lane2;
+      InLookAheadValues[V1].insert(Lane1);
+      InLookAheadValues[V2].insert(Lane2);
 
       // Contains the I2 operand indexes that got matched with I1 operands.
       SmallSet<unsigned, 4> Op2Used;
@@ -1167,6 +1266,7 @@
       } BestOp;
 
       // Iterate through all unused operands and look for the best.
+      bool IsOpLastLaneUndef = isa<UndefValue>(OpLastLane);
       for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
         // Get the operand at Idx and Lane.
         OperandData &OpData = getData(Idx, Lane);
@@ -1183,6 +1283,15 @@
         if (OpAPO != OpIdxAPO)
           continue;
 
+        // Ignore two undefs.
+        if (IsOpLastLaneUndef && isa<UndefValue>(Op)) {
+          if (BestOp.Score < VLOperands::ScoreUndef) {
+            BestOp.Idx = Idx;
+            BestOp.Score = VLOperands::ScoreUndef;
+          }
+          continue;
+        }
+
         // Look for an operand that matches the current mode.
         switch (RMode) {
         case ReorderingMode::Load:
@@ -1200,11 +1309,14 @@
           break;
         }
         case ReorderingMode::Splat:
-          if (Op == OpLastLane)
+          // Undef is also can be part of splat/broadcast.
+          if (Op == OpLastLane || IsOpLastLaneUndef || isa<UndefValue>(Op))
             BestOp.Idx = Idx;
           break;
         case ReorderingMode::Failed:
           return None;
+        case ReorderingMode::Unknown:
+          llvm_unreachable("Unknown mode is not expected here.");
         }
       }
 
@@ -1218,15 +1330,27 @@
 
     /// Helper for reorderOperandVecs. \Returns the lane that we should start
     /// reordering from. This is the one which has the least number of operands
-    /// that can freely move about.
+    /// that can freely move about or less profitable because it already has the
+    /// most optimal set of operands.
     unsigned getBestLaneToStartReordering() const {
       unsigned BestLane = 0;
       unsigned Min = UINT_MAX;
-      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
-           ++Lane) {
-        unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
-        if (NumFreeOps < Min) {
-          Min = NumFreeOps;
+      unsigned SameOpNumber = 0;
+      for (int I = getNumLanes(); I > 0; --I) {
+        unsigned Lane = I - 1;
+        std::pair<unsigned, unsigned> NumFreeOpsHash =
+            getMaxNumOperandsThatCanBeReordered(Lane);
+        // Compare the number of operands that can move and choose the one with
+        // the least number.
+        if (NumFreeOpsHash.first < Min) {
+          Min = NumFreeOpsHash.first;
+          SameOpNumber = NumFreeOpsHash.second;
+          BestLane = Lane;
+        } else if (NumFreeOpsHash.first == Min &&
+                   NumFreeOpsHash.second < SameOpNumber) {
+          // Select the most optimal lane in terms of number of operands that
+          // should be moved around.
+          SameOpNumber = NumFreeOpsHash.second;
           BestLane = Lane;
         }
       }
@@ -1234,9 +1358,11 @@
     }
 
     /// \Returns the maximum number of operands that are allowed to be reordered
-    /// for \p Lane. This is used as a heuristic for selecting the first lane to
-    /// start operand reordering.
-    unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+    /// for \p Lane and the number of compatible instructions(with the same
+    /// parent/opcode). This is used as a heuristic for selecting the first lane
+    /// to start operand reordering.
+    std::pair<unsigned, unsigned>
+    getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
       unsigned CntTrue = 0;
       unsigned NumOperands = getNumOperands();
       // Operands with the same APO can be reordered. We therefore need to count
@@ -1245,11 +1371,35 @@
       // a map. Instead we can simply count the number of operands that
       // correspond to one of them (in this case the 'true' APO), and calculate
       // the other by subtracting it from the total number of operands.
-      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
-        if (getData(OpIdx, Lane).APO)
+      // Operands with the same instruction opcode and parent are more
+      // profitable since we don't need to move them in many cases.
+      bool AllUndefs = true;
+      unsigned SameCodeParentOps = 0;
+      unsigned Opcode = 0;
+      BasicBlock *Parent = nullptr;
+      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+        const OperandData &OpData = getData(OpIdx, Lane);
+        if (OpData.APO)
           ++CntTrue;
+        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
+          if (Opcode != I->getOpcode() || I->getParent() != Parent) {
+            if (SameCodeParentOps == 0) {
+              SameCodeParentOps = 1;
+              Opcode = I->getOpcode();
+              Parent = I->getParent();
+            } else {
+              --SameCodeParentOps;
+            }
+          } else {
+            ++SameCodeParentOps;
+          }
+        }
+        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
+      }
+      if (AllUndefs)
+        return std::make_pair(UINT_MAX, 0);
       unsigned CntFalse = NumOperands - CntTrue;
-      return std::max(CntTrue, CntFalse);
+      return std::make_pair(std::max(CntTrue, CntFalse), SameCodeParentOps);
     }
 
     /// Go through the instructions in VL and append their operands.
@@ -1257,13 +1407,18 @@
       assert(!VL.empty() && "Bad VL");
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
-      assert(isa<Instruction>(VL[0]) && "Expected instruction");
-      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+      unsigned NumOperands = VL0.getNumOperands();
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
         OpsVec[OpIdx].resize(NumLanes);
         for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          if (isa<UndefValue>(VL[Lane])) {
+            OpsVec[OpIdx][Lane] = {
+                UndefValue::get(VL0.getOperand(OpIdx)->getType()), false,
+                false};
+            continue;
+          }
           assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
           // Our tree has just 3 nodes: the root and two operands.
           // It is therefore trivial to get the APO. We only need to check the
@@ -1287,7 +1442,7 @@
     unsigned getNumOperands() const { return OpsVec.size(); }
 
     /// \returns the number of lanes.
-    unsigned getNumLanes() const { return OpsVec[0].size(); }
+    unsigned getNumLanes() const { return NumLanes; }
 
     /// \returns the operand value at \p OpIdx and \p Lane.
     Value *getValue(unsigned OpIdx, unsigned Lane) const {
@@ -1314,7 +1469,7 @@
           OperandData &Data = getData(OpI, Ln);
           if (Data.APO != OpAPO || Data.IsUsed)
             continue;
-          if (Data.V == Op) {
+          if (Data.V == Op || isa<UndefValue>(Op)) {
             FoundCandidate = true;
             Data.IsUsed = true;
             break;
@@ -1328,20 +1483,31 @@
 
   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
+    VLOperands(Instruction &VL0, ArrayRef<Value *> RootVL, const DataLayout &DL,
                ScalarEvolution &SE, const BoUpSLP &R)
-        : DL(DL), SE(SE), R(R) {
+        : DL(DL), SE(SE), R(R), VL0(VL0) {
       // Append all the operands of RootVL.
       appendOperandsOfVL(RootVL);
+      // PowerOf2Ceil(distance between the last instrcution and the first
+      // instruction in the array of scalars).
+      NumLanes = PowerOf2Ceil(
+          std::distance(RootVL.begin(), find_if(reverse(RootVL), [](Value *V) {
+                                          return !isa<UndefValue>(V);
+                                        }).base()));
     }
 
     /// \Returns a value vector with the operands across all lanes for the
     /// opearnd at \p OpIdx.
     ValueList getVL(unsigned OpIdx) const {
       ValueList OpVL(OpsVec[OpIdx].size());
-      assert(OpsVec[OpIdx].size() == getNumLanes() &&
+      assert(std::all_of(std::next(OpsVec[OpIdx].begin(), getNumLanes()),
+                         OpsVec[OpIdx].end(),
+                         [](const OperandData &Data) {
+                           return isa<UndefValue>(Data.V);
+                         }) &&
              "Expected same num of lanes across all operands");
-      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
+      for (unsigned Lane = 0, Lanes = OpsVec[OpIdx].size(); Lane != Lanes;
+           ++Lane)
         OpVL[Lane] = OpsVec[OpIdx][Lane].V;
       return OpVL;
     }
@@ -1355,7 +1521,8 @@
       // Each operand has its own mode. We are using this mode to help us select
       // the instructions for each lane, so that they match best with the ones
       // we have selected so far.
-      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
+      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands,
+                                                     ReorderingMode::Unknown);
 
       // This is a greedy single-pass algorithm. We are going over each lane
       // once and deciding on the best order right away with no back-tracking.
@@ -1449,6 +1616,8 @@
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
       switch (RMode) {
+      case ReorderingMode::Unknown:
+        return "Unknown";
       case ReorderingMode::Load:
         return "Load";
       case ReorderingMode::Opcode:
@@ -1512,6 +1681,13 @@
   /// Checks if all users of \p I are the part of the vectorization tree.
   bool areAllUsersVectorized(Instruction *I) const;
 
+  /// Gets most optimial vectorization factor for the tree entry.
+  /// \param UserVFs Vectorization factors of the user nodes.
+  /// \param IE The starting node when trying to get the vectorization factor.
+  /// Required to stop correctly inside of loops, if we have PHI instructions.
+  unsigned getEntryVF(const TreeEntry *E, SmallSet<unsigned, 4> &UserVFs,
+                      const TreeEntry *IE);
+
   /// \returns the cost of the vectorizable entry.
   InstructionCost getEntryCost(TreeEntry *E);
 
@@ -1530,19 +1706,22 @@
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
 
-  /// Vectorize a single entry in the tree, starting in \p VL.
-  Value *vectorizeTree(ArrayRef<Value *> VL);
+  /// Vectorize a single entry in the tree, starting in \p VL and for
+  /// vectorization factor \p VF.
+  Value *vectorizeTree(ArrayRef<Value *> VL, unsigned VF);
 
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars.
-  InstructionCost
-  getGatherCost(FixedVectorType *Ty,
-                const DenseSet<unsigned> &ShuffledIndices) const;
+  /// \param NeedToShuffle true, if need to shuffle the resulting gather instead
+  /// of inserting same scalars several times.
+  InstructionCost getGatherCost(FixedVectorType *Ty,
+                                const DenseSet<unsigned> &ShuffledIndices,
+                                bool NeedToShuffle) const;
 
   /// \returns the scalarization cost for this list of values. Assuming that
   /// this subtree gets vectorized, we may need to extract the values from the
   /// roots. This method calculates the cost of extracting the values.
-  InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
+  InstructionCost getGatherCost(ArrayRef<Value *> VL, unsigned VF) const;
 
   /// Set the Builder insert point to one after the last instruction in
   /// the bundle
@@ -1557,24 +1736,35 @@
 
   /// Reorder commutative or alt operands to get better probability of
   /// generating vectorized code.
-  static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const DataLayout &DL,
-                                             ScalarEvolution &SE,
-                                             const BoUpSLP &R);
+  static void reorderInputsAccordingToOpcode(
+      Instruction &VL0, ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
+      SmallVectorImpl<Value *> &Right, const DataLayout &DL,
+      ScalarEvolution &SE, const BoUpSLP &R);
   struct TreeEntry {
     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
     TreeEntry(VecTreeTy &Container) : Container(Container) {}
 
-    /// \returns true if the scalars in VL are equal to this entry.
+    /// \returns true if the scalars in VL are equal to this entry. The scalars
+    /// in VL are equal to this entry if it contains the same scalars(or udefs)
+    /// on the same places.
     bool isSame(ArrayRef<Value *> VL) const {
-      if (VL.size() == Scalars.size())
-        return std::equal(VL.begin(), VL.end(), Scalars.begin());
-      return VL.size() == ReuseShuffleIndices.size() &&
-             std::equal(
-                 VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
-                 [this](Value *V, int Idx) { return V == Scalars[Idx]; });
+      if (!ReuseShuffleIndices.empty()) {
+        for (int I = 0, E = VL.size(); I < E; ++I) {
+          int Idx = ReuseShuffleIndices[I];
+          if (Idx == E) {
+            if (!isa<UndefValue>(VL[I]))
+              return false;
+            continue;
+          }
+          if (VL[I] != Scalars[Idx] && !isa<UndefValue>(VL[I]))
+            return false;
+        }
+        return true;
+      }
+      for (int I = 0, E = VL.size(); I < E; ++I)
+        if (VL[I] != Scalars[I] && !isa<UndefValue>(VL[I]))
+          return false;
+      return true;
     }
 
     /// A vector of scalars.
@@ -1632,15 +1822,19 @@
     }
 
     /// Set the operands of this bundle in their original order.
-    void setOperandsInOrder() {
+    void setOperandsInOrder(Instruction *I0) {
       assert(Operands.empty() && "Already initialized?");
-      auto *I0 = cast<Instruction>(Scalars[0]);
       Operands.resize(I0->getNumOperands());
       unsigned NumLanes = Scalars.size();
       for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
            OpIdx != NumOperands; ++OpIdx) {
         Operands[OpIdx].resize(NumLanes);
         for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          if (isa<UndefValue>(Scalars[Lane])) {
+            Operands[OpIdx][Lane] =
+                UndefValue::get(I0->getOperand(OpIdx)->getType());
+            continue;
+          }
           auto *I = cast<Instruction>(Scalars[Lane]);
           assert(I->getNumOperands() == NumOperands &&
                  "Expected same number of operands");
@@ -1712,7 +1906,11 @@
     bool updateStateIfReorder() {
       if (ReorderIndices.empty())
         return false;
-      InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
+      unsigned Size = Scalars.size();
+      InstructionsState S =
+          getSameOpcode(Scalars, *find_if(ReorderIndices, [Size](unsigned Idx) {
+                          return Idx < Size;
+                        }));
       setOperations(S);
       return true;
     }
@@ -1820,8 +2018,9 @@
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     Last->setOperations(S);
+    auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
     if (Last->State != TreeEntry::NeedToGather) {
-      for (Value *V : VL) {
+      for (Value *V : InstructionsOnly) {
         assert(!getTreeEntry(V) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
@@ -1833,10 +2032,12 @@
         BundleMember->Lane = Lane;
         ++Lane;
       }
-      assert((!Bundle.getValue() || Lane == VL.size()) &&
+      assert((!Bundle.getValue() ||
+              Lane == std::distance(InstructionsOnly.begin(),
+                                    InstructionsOnly.end())) &&
              "Bundle and VL out of sync");
     } else {
-      MustGather.insert(VL.begin(), VL.end());
+      MustGather.insert(InstructionsOnly.begin(), InstructionsOnly.end());
     }
 
     if (UserTreeIdx.UserTE)
@@ -1871,9 +2072,18 @@
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
+  /// Vectorization factors for tree entries.
+  SmallDenseMap<const TreeEntry *, unsigned> EntryVFs;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+  /// A list of loads to be gathered during the vectorization process. We can
+  /// try to vectorize them at the end, if profitable.
+  SmallVector<LoadInst *, 4> GatheredLoads;
+  /// The index of the first gathered load entry in the VectorizeTree.
+  int GatheredLoadsEntriesFirst = -1;
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -2545,6 +2755,91 @@
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
+  // Try to vectorize gathered loads.
+  if (!GatheredLoads.empty() && !isTreeTinyAndNotFullyVectorizable()) {
+    GatheredLoadsEntriesFirst = VectorizableTree.size();
+    SmallDenseMap<LoadInst *, Value *, 8> GatherPointers;
+    for (LoadInst *LI : GatheredLoads)
+      GatherPointers.try_emplace(LI,
+                                 getUnderlyingObject(LI->getPointerOperand()));
+
+    // Sort by type, base pointers and parents.
+    auto &&LoadSorter = [&GatherPointers](LoadInst *V, LoadInst *V2) {
+      return V->getParent() < V2->getParent() ||
+             (V->getParent() == V2->getParent() &&
+              V->getPointerOperand()->getType() <
+                  V2->getPointerOperand()->getType()) ||
+             (V->getParent() == V2->getParent() &&
+              V->getPointerOperand()->getType() ==
+                  V2->getPointerOperand()->getType() &&
+              GatherPointers[V] < GatherPointers[V2]);
+    };
+
+    llvm::stable_sort(GatheredLoads, LoadSorter);
+
+    // Try to vectorize elements based on their types, bases and parents.
+    for (auto IncIt = GatheredLoads.begin(), E = GatheredLoads.end();
+         IncIt != E;) {
+
+      // Look for the next elements with the same type.
+      auto *SameTypeIt = IncIt;
+      Type *EltTy = (*IncIt)->getPointerOperand()->getType();
+      Value *Ptr = GatherPointers[*IncIt];
+
+      SetVector<LoadInst *> Set(IncIt, SameTypeIt);
+      while (SameTypeIt != E &&
+             (*SameTypeIt)->getParent() == (*IncIt)->getParent() &&
+             (*SameTypeIt)->getPointerOperand()->getType() == EltTy &&
+             Ptr == GatherPointers[*SameTypeIt]) {
+        if (!getTreeEntry(*SameTypeIt))
+          Set.insert(*SameTypeIt);
+        ++SameTypeIt;
+      }
+
+      ArrayRef<LoadInst *> Loads = Set.getArrayRef();
+      int NumElts = Loads.size();
+      if (NumElts >= 3 || (NumElts == 2 && all_of(Loads, [](LoadInst *LI) {
+                             return LI->hasOneUse();
+                           }))) {
+        SmallVector<Value *, 4> Pointers(NumElts);
+        for (int I = 0; I < NumElts; ++I)
+          Pointers[I] = Loads[I]->getPointerOperand();
+        SmallVector<unsigned, 4> SortedIndicies;
+        if (sortPtrAccesses(Pointers, *DL, *SE, SortedIndicies)) {
+          if (SortedIndicies.empty()) {
+            SortedIndicies.assign(NumElts, 0);
+            std::iota(SortedIndicies.begin(), SortedIndicies.end(), 0);
+          }
+          Optional<int> Diff =
+              getPointersDiff(Pointers[SortedIndicies.front()],
+                              Pointers[SortedIndicies.back()], *DL, *SE);
+          int MaxLoads = std::max(getMaxVecRegSize() / DL->getTypeSizeInBits(
+                                                           Loads[0]->getType()),
+                                  Roots.size()) *
+                         (NumElts >= 4 ? 1 : 2);
+          if (Diff && *Diff < MaxLoads) {
+            SmallVector<Value *, 4> Values(
+                PowerOf2Ceil(*Diff + 1), UndefValue::get((*IncIt)->getType()));
+            // Sort loads.
+            Values[0] = Loads[SortedIndicies.front()];
+            for (int I = 1; I < NumElts; ++I) {
+              Optional<int> Diff =
+                  getPointersDiff(Pointers[SortedIndicies.front()],
+                                  Pointers[SortedIndicies[I]], *DL, *SE);
+              Values[*Diff] = Loads[SortedIndicies[I]];
+            }
+            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
+                              << NumElts << ")\n");
+
+            buildTree_rec(Values, 0, EdgeInfo());
+          }
+        }
+      }
+
+      // Start over at the next instruction of a different type (or the end).
+      IncIt = SameTypeIt;
+    }
+  }
 
   // Collect the values that we need to extract from the tree.
   for (auto &TEPtr : VectorizableTree) {
@@ -2557,6 +2852,8 @@
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
+      if (isa<UndefValue>(Scalar))
+        continue;
       int FoundLane = Lane;
       if (!Entry->ReuseShuffleIndices.empty()) {
         FoundLane =
@@ -2580,9 +2877,12 @@
 
         // Skip in-tree scalars that become vectors
         if (TreeEntry *UseEntry = getTreeEntry(U)) {
-          Value *UseScalar = UseEntry->Scalars[0];
+          auto *It = llvm::find_if(UseEntry->Scalars, Instruction::classof);
+          assert(It != UseEntry->Scalars.end() &&
+                 "At least single instruction is expected.");
+          Value *UseScalar = *It;
           // Some in-tree scalars will remain as scalar in vectorized
-          // instructions. If that is the case, the one in Lane 0 will
+          // instructions. If that is the case, the one in the first lane will
           // be used.
           if (UseScalar != U ||
               UseEntry->State == TreeEntry::ScatterVectorize ||
@@ -2606,6 +2906,28 @@
   }
 }
 
+/// Tries to find subvector of loads and builds new vector of only loads if can
+/// be profitable.
+static void
+gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef<Value *> VL,
+                                SmallVectorImpl<LoadInst *> &GatheredLoads) {
+  for (Value *V : VL) {
+    if (auto *LI = dyn_cast<LoadInst>(V))
+      if (!R.isDeleted(LI))
+        GatheredLoads.push_back(LI);
+  }
+}
+
+/// Checks if the mask is uniforms, i.e. consequent and/or with some undefs.
+template <typename T> static bool isUniform(const T &Mask) {
+  for (typename T::value_type I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] != I && Mask[I] != E &&
+        Mask[I] != static_cast<typename T::value_type>(UndefMaskElem))
+      return false;
+  }
+  return true;
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -2631,8 +2953,11 @@
       return;
     }
 
+  auto InitialInstructionsOnly = make_filter_range(VL, Instruction::classof);
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+  if (allConstant(VL) || isSplat(VL) ||
+      !allSameBlock(InitialInstructionsOnly) || !S.getOpcode()) {
+    gatherPossiblyVectorizableLoads(*this, VL, GatheredLoads);
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
     return;
@@ -2642,7 +2967,7 @@
   // the same block.
 
   // Don't vectorize ephemeral values.
-  for (Value *V : VL) {
+  for (Value *V : InitialInstructionsOnly) {
     if (EphValues.count(V)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is ephemeral.\n");
@@ -2668,11 +2993,8 @@
   }
 
   // Check that none of the instructions in the bundle are already in the tree.
-  for (Value *V : VL) {
-    auto *I = dyn_cast<Instruction>(V);
-    if (!I)
-      continue;
-    if (getTreeEntry(I)) {
+  for (Value *V : InitialInstructionsOnly) {
+    if (getTreeEntry(V)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -2680,11 +3002,9 @@
     }
   }
 
-  // If any of the scalars is marked as a value that needs to stay scalar, then
-  // we need to gather the scalars.
-  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
-  for (Value *V : VL) {
-    if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+  // The reduction nodes (stored in UserIgnoreList) should stay scalar.
+  for (Value *V : InitialInstructionsOnly) {
+    if (is_contained(UserIgnoreList, V)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
       return;
@@ -2704,29 +3024,56 @@
     return;
   }
 
+  ArrayRef<Value *> OriginalVL = VL;
   // Check that every instruction appears once in this bundle.
   SmallVector<unsigned, 4> ReuseShuffleIndicies;
   SmallVector<Value *, 4> UniqueValues;
   DenseMap<Value *, unsigned> UniquePositions;
+  UniqueValues.reserve(VL.size());
+  ReuseShuffleIndicies.reserve(VL.size());
+  unsigned NumberOfInstructions = 0;
+  unsigned UserNumberOfInstructions = 0;
+  if (const TreeEntry *UserTE = UserTreeIdx.UserTE)
+    UserNumberOfInstructions =
+        count_if(UserTE->Scalars, [](Value *V) { return !isa<UndefValue>(V); });
+  unsigned Pos = 0;
   for (Value *V : VL) {
+    if (isa<UndefValue>(V)) {
+      ReuseShuffleIndicies.emplace_back(
+          Pos < UserNumberOfInstructions ? Pos : VL.size());
+      ++Pos;
+      continue;
+    }
     auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
     ReuseShuffleIndicies.emplace_back(Res.first->second);
-    if (Res.second)
+    if (Res.second) {
       UniqueValues.emplace_back(V);
+      ++NumberOfInstructions;
+    }
+    ++Pos;
   }
-  size_t NumUniqueScalarValues = UniqueValues.size();
-  if (NumUniqueScalarValues == VL.size()) {
+  if (NumberOfInstructions == VL.size()) {
     ReuseShuffleIndicies.clear();
   } else {
     LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-    if (NumUniqueScalarValues <= 1 ||
-        !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
-      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+    if (NumberOfInstructions <= 1) {
+      gatherPossiblyVectorizableLoads(*this, VL, GatheredLoads);
+      LLVM_DEBUG(dbgs() << "SLP: Single scalar in bundle"
+                        << *UniqueValues.front() << ".\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
       return;
     }
+    // Check if the reuse shuffle mask is uniform anbd no need to count undefs
+    // as real operands.
+    if ((UserNumberOfInstructions == 0 ||
+         UserNumberOfInstructions == NumberOfInstructions) &&
+        isUniform(ReuseShuffleIndicies))
+      ReuseShuffleIndicies.clear();
+    UniqueValues.append(VL.size() - UniqueValues.size(),
+                        UndefValue::get(VL0->getType()));
     VL = UniqueValues;
   }
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
 
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
@@ -2740,8 +3087,7 @@
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                 ReuseShuffleIndicies);
+    newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
     return;
   }
   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -2753,17 +3099,16 @@
       auto *PH = cast<PHINode>(VL0);
 
       // Check for terminator values (e.g. invoke).
-      for (Value *V : VL)
+      for (Value *V : InstructionsOnly)
         for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
-          Instruction *Term = dyn_cast<Instruction>(
-              cast<PHINode>(V)->getIncomingValueForBlock(
+          auto *Term =
+              dyn_cast<Instruction>(cast<PHINode>(V)->getIncomingValueForBlock(
                   PH->getIncomingBlock(I)));
           if (Term && Term->isTerminator()) {
             LLVM_DEBUG(dbgs()
                        << "SLP: Need to swizzle PHINodes (terminator use).\n");
             BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                         ReuseShuffleIndicies);
+            newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
             return;
           }
         }
@@ -2778,8 +3123,10 @@
         ValueList Operands;
         // Prepare the operand vector.
         for (Value *V : VL)
-          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
-              PH->getIncomingBlock(I)));
+          Operands.emplace_back(
+              isa<UndefValue>(V) ? UndefValue::get(V->getType())
+                                 : cast<PHINode>(V)->getIncomingValueForBlock(
+                                       PH->getIncomingBlock(I)));
         TE->setOperand(I, Operands);
         OperandsVec.push_back(Operands);
       }
@@ -2815,8 +3162,13 @@
         // otherwise return the iterator to the existing one.
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies, CurrentOrder);
-        findRootOrder(CurrentOrder);
-        ++NumOpsWantToKeepOrder[CurrentOrder];
+        // No need to reorder if still need to shuffle reuses.
+        if (ReuseShuffleIndicies.empty()) {
+          findRootOrder(CurrentOrder);
+          ++NumOpsWantToKeepOrder[CurrentOrder];
+        } else {
+          ++NumOpsWantToKeepOriginalOrder;
+        }
         // This is a special case, as it does not gather, but at the same time
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
@@ -2825,8 +3177,7 @@
         return;
       }
       LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
+      newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
       BS.cancelScheduling(VL, VL0);
       return;
     }
@@ -2842,27 +3193,31 @@
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
+        newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
         LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
 
       // Make sure all loads in the bundle are simple - we can't vectorize
       // atomic or volatile loads.
-      SmallVector<Value *, 4> PointerOps(VL.size());
-      auto POIter = PointerOps.begin();
-      for (Value *V : VL) {
-        auto *L = cast<LoadInst>(V);
+      SmallVector<Value *, 4> PointerOps(NumberOfInstructions);
+      OrdersType OriginalOrder(NumberOfInstructions, 0);
+      auto *POIter = PointerOps.begin();
+      auto *OOIter = OriginalOrder.begin();
+      for (int I = 0, E = VL.size(); I < E; ++I) {
+        if (isa<UndefValue>(VL[I]))
+          continue;
+        auto *L = cast<LoadInst>(VL[I]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
         *POIter = L->getPointerOperand();
         ++POIter;
+        *OOIter = I;
+        ++OOIter;
       }
 
       OrdersType CurrentOrder;
@@ -2879,39 +3234,93 @@
         }
         Optional<int> Diff = getPointersDiff(Ptr0, PtrN, *DL, *SE);
         // Check that the sorted loads are consecutive.
-        if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
+        int AcceptableDiff = NumberOfInstructions - 1;
+        Align CommonAlign = cast<LoadInst>(VL0)->getAlign();
+        if (!CurrentOrder.empty())
+          CommonAlign = cast<LoadInst>(VL[OriginalOrder[CurrentOrder.front()]])
+                            ->getAlign();
+        unsigned Sz = DL->getTypeStoreSize(ScalarTy);
+        if (Diff && *Diff >= AcceptableDiff &&
+            *Diff <= static_cast<int>(VL.size() - 1) &&
+            (TTI->isLegalMaskedLoad(
+                 FixedVectorType::get(ScalarTy, PowerOf2Ceil(*Diff + 1)),
+                 CommonAlign) ||
+             isPowerOf2_32(
+                 std::min(PowerOf2Ceil(*Diff + 1),
+                          alignTo((*Diff + 1) * Sz, CommonAlign) / Sz)))) {
           if (CurrentOrder.empty()) {
-            // Original loads are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
-            TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
-                                         UserTreeIdx, ReuseShuffleIndicies);
-            TE->setOperandsInOrder();
+            if (*Diff == AcceptableDiff && isUniform(OriginalOrder)) {
+              // Original loads are consecutive and do not require reordering.
+              TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+                                           UserTreeIdx, ReuseShuffleIndicies);
+              TE->setOperandsInOrder(VL0);
+            } else {
+              OrdersType NormalizedOrder(VL.size(), VL.size());
+              for (int I = 0, E = OriginalOrder.size(); I < E; ++I) {
+                NormalizedOrder[*getPointersDiff(Ptr0, PointerOps[I], *DL,
+                                                 *SE)] = OriginalOrder[I];
+              }
+              // Need to extend.
+              TreeEntry *TE =
+                  newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                               ReuseShuffleIndicies, NormalizedOrder);
+              TE->setOperandsInOrder(VL0);
+            }
+            // Count orders of non-gathered loads only.
+            if ((UserTreeIdx.UserTE || Depth == 0) &&
+                !all_of(InstructionsOnly,
+                        [this](Value *V) { return MustGather.contains(V); }))
+              ++NumOpsWantToKeepOriginalOrder;
             LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
           } else {
+            OrdersType NormalizedOrder(VL.size(), VL.size());
+            SmallVector<int, 4> Orders(CurrentOrder.size());
+            inversePermutation(CurrentOrder, Orders);
+            for (int I = 0, E = CurrentOrder.size(); I < E; ++I) {
+              NormalizedOrder[*getPointersDiff(Ptr0, PointerOps[Orders[I]], *DL,
+                                               *SE)] = OriginalOrder[Orders[I]];
+            }
             // Need to reorder.
             TreeEntry *TE =
                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                             ReuseShuffleIndicies, CurrentOrder);
-            TE->setOperandsInOrder();
+                             ReuseShuffleIndicies, NormalizedOrder);
+            TE->setOperandsInOrder(VL0);
             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
-            findRootOrder(CurrentOrder);
-            ++NumOpsWantToKeepOrder[CurrentOrder];
+            // No need to reorder if still need to shuffle reuses.
+            if (ReuseShuffleIndicies.empty()) {
+              findRootOrder(NormalizedOrder);
+              ++NumOpsWantToKeepOrder[NormalizedOrder];
+            } else {
+              ++NumOpsWantToKeepOriginalOrder;
+            }
           }
           return;
         }
-        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
-        TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
-                                     UserTreeIdx, ReuseShuffleIndicies);
-        TE->setOperandsInOrder();
-        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
-        LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
-        return;
+        Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+        for (Value *V : InstructionsOnly)
+          CommonAlignment =
+              commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+        if (TTI->isLegalMaskedGather(
+                FixedVectorType::get(ScalarTy,
+                                     PowerOf2Ceil(NumberOfInstructions)),
+                CommonAlignment)) {
+          // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+          TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle,
+                                       S, UserTreeIdx, ReuseShuffleIndicies);
+          TE->setOperandsInOrder(VL0);
+          PointerOps.append(
+              VL.size() - NumberOfInstructions,
+              UndefValue::get(cast<LoadInst>(VL0)->getPointerOperandType()));
+          buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+          LLVM_DEBUG(dbgs()
+                     << "SLP: added a vector of non-consecutive loads.\n");
+          return;
+        }
       }
 
       LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
+      newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
       return;
     }
     case Instruction::ZExt:
@@ -2927,12 +3336,11 @@
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       Type *SrcTy = VL0->getOperand(0)->getType();
-      for (Value *V : VL) {
+      for (Value *V : InstructionsOnly) {
         Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           LLVM_DEBUG(dbgs()
                      << "SLP: Gathering casts with different src types.\n");
           return;
@@ -2942,12 +3350,14 @@
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
-      TE->setOperandsInOrder();
+      TE->setOperandsInOrder(VL0);
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
         for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(i));
+          Operands.push_back(isa<UndefValue>(V)
+                                 ? UndefValue::get(SrcTy)
+                                 : cast<Instruction>(V)->getOperand(i));
 
         buildTree_rec(Operands, Depth + 1, {TE, i});
       }
@@ -2959,13 +3369,12 @@
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
       CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
       Type *ComparedTy = VL0->getOperand(0)->getType();
-      for (Value *V : VL) {
-        CmpInst *Cmp = cast<CmpInst>(V);
+      for (Value *V : InstructionsOnly) {
+        auto *Cmp = cast<CmpInst>(V);
         if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           LLVM_DEBUG(dbgs()
                      << "SLP: Gathering cmp with different predicate.\n");
           return;
@@ -2981,10 +3390,15 @@
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == SwapP0 && "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
+          if (isa<UndefValue>(V)) {
+            Left.push_back(UndefValue::get(VL0->getOperand(0)->getType()));
+            Right.push_back(UndefValue::get(VL0->getOperand(1)->getType()));
+            continue;
+          }
           auto *Cmp = cast<CmpInst>(V);
           Value *LHS = Cmp->getOperand(0);
           Value *RHS = Cmp->getOperand(1);
@@ -3028,7 +3442,7 @@
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this);
         TE->setOperand(0, Left);
         TE->setOperand(1, Right);
         buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -3036,25 +3450,54 @@
         return;
       }
 
-      TE->setOperandsInOrder();
-      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      SmallVector<ValueList, 2> OperandsVec;
+      for (unsigned I = 0, E = VL0->getNumOperands(); I < E; ++I) {
         ValueList Operands;
+        Value *DefinedOp = nullptr;
+        // Cannot use undef for int div/rem, use the last real value instead.
+        if (BinaryOperator::isIntDivRem(ShuffleOrOp)) {
+          const auto *It = find_if(VL, [I](Value *V) {
+            return isa<Instruction>(V) &&
+                   !isa<UndefValue>(cast<Instruction>(V)->getOperand(I));
+          });
+          if (It != VL.end())
+            DefinedOp = cast<Instruction>(*It)->getOperand(I);
+        }
         // Prepare the operand vector.
-        for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
-        buildTree_rec(Operands, Depth + 1, {TE, i});
+        for (Value *V : VL.slice(
+                 0, PowerOf2Ceil(std::distance(
+                        VL.begin(),
+                        find_if(reverse(VL), Instruction::classof).base())))) {
+          Value *OpV;
+          if (isa<UndefValue>(V)) {
+            if (BinaryOperator::isIntDivRem(ShuffleOrOp) && DefinedOp)
+              OpV = DefinedOp;
+            else
+              OpV = UndefValue::get(VL0->getOperand(I)->getType());
+          } else {
+            OpV = cast<Instruction>(V)->getOperand(I);
+            if (isa<UndefValue>(OpV) &&
+                BinaryOperator::isIntDivRem(ShuffleOrOp) && DefinedOp)
+              OpV = DefinedOp;
+          }
+          Operands.push_back(OpV);
+        }
+        Operands.append(VL.size() - Operands.size(),
+                        UndefValue::get(VL0->getOperand(I)->getType()));
+        TE->setOperand(I, Operands);
+        OperandsVec.push_back(Operands);
       }
+      for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
+        buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
       return;
     }
     case Instruction::GetElementPtr: {
       // We don't combine GEPs with complicated (nested) indexing.
-      for (Value *V : VL) {
+      for (Value *V : InstructionsOnly) {
         if (cast<Instruction>(V)->getNumOperands() != 2) {
           LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           return;
         }
       }
@@ -3062,22 +3505,21 @@
       // We can't combine several GEPs into one vector if they operate on
       // different types.
       Type *Ty0 = VL0->getOperand(0)->getType();
-      for (Value *V : VL) {
+      for (Value *V : InstructionsOnly) {
         Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
         if (Ty0 != CurTy) {
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           return;
         }
       }
 
       // We don't combine GEPs with non-constant indexes.
       Type *Ty1 = VL0->getOperand(1)->getType();
-      for (Value *V : VL) {
-        auto Op = cast<Instruction>(V)->getOperand(1);
+      for (Value *V : InstructionsOnly) {
+        auto *Op = cast<Instruction>(V)->getOperand(1);
         if (!isa<ConstantInt>(Op) ||
             (Op->getType() != Ty1 &&
              Op->getType()->getScalarSizeInBits() >
@@ -3086,8 +3528,7 @@
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           return;
         }
       }
@@ -3095,12 +3536,15 @@
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
-      TE->setOperandsInOrder();
+      TE->setOperandsInOrder(VL0);
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
         for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(i));
+          Operands.push_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(VL0->getOperand(i)->getType())
+                  : cast<Instruction>(V)->getOperand(i));
 
         buildTree_rec(Operands, Depth + 1, {TE, i});
       }
@@ -3114,72 +3558,107 @@
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
+        newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
         LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
         return;
       }
       // Make sure all stores in the bundle are simple - we can't vectorize
       // atomic or volatile stores.
-      SmallVector<Value *, 4> PointerOps(VL.size());
+      SmallVector<Value *, 4> PointerOps(NumberOfInstructions);
+      OrdersType OriginalOrder(NumberOfInstructions, 0);
       ValueList Operands(VL.size());
       auto POIter = PointerOps.begin();
       auto OIter = Operands.begin();
-      for (Value *V : VL) {
-        auto *SI = cast<StoreInst>(V);
+      auto *OOIter = OriginalOrder.begin();
+      for (int I = 0, E = VL.size(); I < E; ++I) {
+        if (isa<UndefValue>(VL[I])) {
+          *OIter = UndefValue::get(VL0->getOperand(0)->getType());
+          ++OIter;
+          continue;
+        }
+        auto *SI = cast<StoreInst>(VL[I]);
         if (!SI->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
           return;
         }
         *POIter = SI->getPointerOperand();
         *OIter = SI->getValueOperand();
+        *OOIter = I;
         ++POIter;
         ++OIter;
+        ++OOIter;
       }
 
       OrdersType CurrentOrder;
+      if (!llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+        BS.cancelScheduling(VL, VL0);
+        newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
+        LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+        return;
+      }
       // Check the order of pointer operands.
-      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
-        Value *Ptr0;
-        Value *PtrN;
+      Value *Ptr0;
+      Value *PtrN;
+      if (CurrentOrder.empty()) {
+        Ptr0 = PointerOps.front();
+        PtrN = PointerOps.back();
+      } else {
+        Ptr0 = PointerOps[CurrentOrder.front()];
+        PtrN = PointerOps[CurrentOrder.back()];
+      }
+      Optional<int> Dist = getPointersDiff(Ptr0, PtrN, *DL, *SE);
+      // Check that the sorted pointer operands are consecutive.
+      int NormalizedSize = NumberOfInstructions - 1;
+      if (Dist && *Dist >= NormalizedSize &&
+          *Dist <= static_cast<int>(VL.size() - 1)) {
         if (CurrentOrder.empty()) {
-          Ptr0 = PointerOps.front();
-          PtrN = PointerOps.back();
-        } else {
-          Ptr0 = PointerOps[CurrentOrder.front()];
-          PtrN = PointerOps[CurrentOrder.back()];
-        }
-        Optional<int> Dist = getPointersDiff(Ptr0, PtrN, *DL, *SE);
-        // Check that the sorted pointer operands are consecutive.
-        if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
-          if (CurrentOrder.empty()) {
+          TreeEntry *TE;
+          if (NumberOfInstructions == VL.size() && isUniform(OriginalOrder)) {
             // Original stores are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
-            TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
-                                         UserTreeIdx, ReuseShuffleIndicies);
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+            TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                              ReuseShuffleIndicies);
           } else {
-            TreeEntry *TE =
-                newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                             ReuseShuffleIndicies, CurrentOrder);
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
-            findRootOrder(CurrentOrder);
-            ++NumOpsWantToKeepOrder[CurrentOrder];
+            // Need to extend.
+            OrdersType NormalizedOrder(VL.size(), VL.size());
+            for (int I = 0, E = OriginalOrder.size(); I < E; ++I) {
+              NormalizedOrder[*getPointersDiff(Ptr0, PointerOps[I], *DL, *SE)] =
+                  OriginalOrder[I];
+            }
+            TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                              ReuseShuffleIndicies, NormalizedOrder);
+          }
+          TE->setOperandsInOrder(VL0);
+          buildTree_rec(Operands, Depth + 1, {TE, 0});
+          ++NumOpsWantToKeepOriginalOrder;
+          LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+        } else {
+          OrdersType NormalizedOrder(VL.size(), VL.size());
+          SmallVector<int, 4> Orders(CurrentOrder.size());
+          inversePermutation(CurrentOrder, Orders);
+          for (int I = 0, E = CurrentOrder.size(); I < E; ++I) {
+            NormalizedOrder[*getPointersDiff(Ptr0, PointerOps[Orders[I]], *DL,
+                                             *SE)] = OriginalOrder[Orders[I]];
+          }
+          TreeEntry *TE =
+              newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                           ReuseShuffleIndicies, NormalizedOrder);
+          TE->setOperandsInOrder(VL0);
+          buildTree_rec(Operands, Depth + 1, {TE, 0});
+          LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+          // No need to reorder if still need to shuffle reuses.
+          if (ReuseShuffleIndicies.empty()) {
+            findRootOrder(NormalizedOrder);
+            ++NumOpsWantToKeepOrder[NormalizedOrder];
+          } else {
+            ++NumOpsWantToKeepOriginalOrder;
           }
-          return;
         }
+        return;
       }
-
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
+      newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
       LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
       return;
     }
@@ -3189,15 +3668,16 @@
       CallInst *CI = cast<CallInst>(VL0);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
-      VFShape Shape = VFShape::get(
-          *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
-          false /*HasGlobalPred*/);
+      VFShape Shape =
+          VFShape::get(*CI,
+                       ElementCount::getFixed(static_cast<unsigned int>(
+                           PowerOf2Ceil(NumberOfInstructions))),
+                       false /*HasGlobalPred*/);
       Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
 
       if (!VecFunc && !isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
+        newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
         LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -3208,6 +3688,8 @@
         if (hasVectorInstrinsicScalarOpd(ID, j))
           ScalarArgs[j] = CI->getArgOperand(j);
       for (Value *V : VL) {
+        if (isa<UndefValue>(V))
+          continue;
         CallInst *CI2 = dyn_cast<CallInst>(V);
         if (!CI2 || CI2->getCalledFunction() != F ||
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
@@ -3215,10 +3697,9 @@
              VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
-                            << "\n");
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
+          LLVM_DEBUG(dbgs()
+                     << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n");
           return;
         }
         // Some intrinsics have scalar arguments and should be same in order for
@@ -3228,8 +3709,7 @@
             Value *A1J = CI2->getArgOperand(j);
             if (ScalarArgs[j] != A1J) {
               BS.cancelScheduling(VL, VL0);
-              newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                           ReuseShuffleIndicies);
+              newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
               LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                                 << " argument " << ScalarArgs[j] << "!=" << A1J
                                 << "\n");
@@ -3243,21 +3723,30 @@
                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndicies);
+          newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
           LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                             << *CI << "!=" << *V << '\n');
           return;
         }
       }
+      SmallVector<Value *, 4> NormalizedCalls(VL.size(),
+                                              UndefValue::get(CI->getType()));
+      copy(VL, NormalizedCalls.begin());
+      for (int I = NumberOfInstructions, E = PowerOf2Ceil(NumberOfInstructions);
+           I < E; ++I)
+        NormalizedCalls[I] = CI;
 
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
-      TE->setOperandsInOrder();
+      TE->setOperandsInOrder(VL0);
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *V : VL) {
+        for (Value *V : NormalizedCalls) {
+          if (isa<UndefValue>(V)) {
+            Operands.push_back(UndefValue::get(CI->getOperand(i)->getType()));
+            continue;
+          }
           auto *CI2 = cast<CallInst>(V);
           Operands.push_back(CI2->getArgOperand(i));
         }
@@ -3270,8 +3759,7 @@
       // then do not vectorize this instruction.
       if (!S.isAltShuffle()) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies);
+        newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
         LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
       }
@@ -3282,7 +3770,7 @@
       // Reorder operands if reordering would enable vectorization.
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this);
         TE->setOperand(0, Left);
         TE->setOperand(1, Right);
         buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -3290,12 +3778,15 @@
         return;
       }
 
-      TE->setOperandsInOrder();
+      TE->setOperandsInOrder(VL0);
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
         for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(i));
+          Operands.push_back(
+              isa<UndefValue>(V)
+                  ? UndefValue::get(VL0->getOperand(i)->getType())
+                  : cast<Instruction>(V)->getOperand(i));
 
         buildTree_rec(Operands, Depth + 1, {TE, i});
       }
@@ -3303,8 +3794,7 @@
     }
     default:
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
+      newTreeEntry(OriginalVL, None /*not vectorized*/, S, UserTreeIdx);
       LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -3368,39 +3858,43 @@
     NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
   }
 
-  if (NElts != VL.size())
-    return false;
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
+  const unsigned NumOfInstructions =
+      std::distance(InstructionsOnly.begin(), InstructionsOnly.end());
 
   // Check that all of the indices extract from the correct offset.
   bool ShouldKeepOrder = true;
   unsigned E = VL.size();
-  // Assign to all items the initial value E + 1 so we can check if the extract
+  // Assign to all items the initial value E so we can check if the extract
   // instruction index was used already.
   // Also, later we can check that all the indices are used and we have a
   // consecutive access in the extract instructions, by checking that no
-  // element of CurrentOrder still has value E + 1.
-  CurrentOrder.assign(E, E + 1);
+  // element of CurrentOrder still has value E.
+  CurrentOrder.assign(E, E);
   unsigned I = 0;
-  for (; I < E; ++I) {
-    auto *Inst = cast<Instruction>(VL[I]);
+  auto II = InstructionsOnly.begin();
+  for (; I < NumOfInstructions; ++I, ++II) {
+    auto *Inst = cast<Instruction>(*II);
     if (Inst->getOperand(0) != Vec)
       break;
     Optional<unsigned> Idx = getExtractIndex(Inst);
     if (!Idx)
       break;
     const unsigned ExtIdx = *Idx;
+    if (ExtIdx >= E)
+      break;
     if (ExtIdx != I) {
-      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+      if (CurrentOrder[ExtIdx] != E)
         break;
       ShouldKeepOrder = false;
       CurrentOrder[ExtIdx] = I;
     } else {
-      if (CurrentOrder[I] != E + 1)
+      if (CurrentOrder[I] != E)
         break;
       CurrentOrder[I] = I;
     }
   }
-  if (I < E) {
+  if (I < NumOfInstructions) {
     CurrentOrder.clear();
     return false;
   }
@@ -3447,30 +3941,139 @@
   return {IntrinsicCost, LibCost};
 }
 
+/// Returns the indecies for the first and the last instructions based on
+/// ordering.
+static std::pair<unsigned, unsigned>
+findMinMaxPos(ArrayRef<unsigned> ReorderedIndicies) {
+  unsigned E = ReorderedIndicies.size();
+  unsigned Min = E;
+  unsigned Max = E;
+  for (unsigned I = 0; I < E && (Min == E || Max == E); ++I) {
+    if (Min == E && ReorderedIndicies[I] < E)
+      Min = I;
+    if (Max == E && ReorderedIndicies[E - 1 - I] < E)
+      Max = E - 1 - I;
+  }
+  return std::make_pair(Min, Max);
+}
+
+unsigned BoUpSLP::getEntryVF(const TreeEntry *E, SmallSet<unsigned, 4> &UserVFs,
+                             const TreeEntry *IE) {
+  auto It = EntryVFs.find(E);
+  if (It != EntryVFs.end())
+    return It->second;
+  auto &&GetVF = [](ArrayRef<Value *> Scalars,
+                    ArrayRef<unsigned> ReorderIndices,
+                    unsigned Opcode) -> unsigned {
+    // For stores, the vectorization factor is the number of scalars, it is
+    // aligned to the minimal/maximal size of the vector register.
+    if (Opcode == Instruction::Store)
+      return Scalars.size();
+    unsigned NumValues =
+        std::distance(Scalars.begin(), find_if(reverse(Scalars), [](Value *V) {
+                                         return !isa<UndefValue>(V);
+                                       }).base());
+    if (!ReorderIndices.empty()) {
+      unsigned MinPos, MaxPos;
+      std::tie(MinPos, MaxPos) = findMinMaxPos(ReorderIndices);
+      NumValues = std::max(NumValues, MaxPos + 1);
+    }
+
+    return PowerOf2Ceil(NumValues);
+  };
+  unsigned SelfVF = GetVF(E->Scalars, E->ReorderIndices, E->getOpcode());
+  bool IsGather = E->State == TreeEntry::NeedToGather;
+  EntryVFs.try_emplace(E, IsGather ? 0 : std::min<unsigned>(2, SelfVF));
+  unsigned MinVF = E->Scalars.size();
+  // Fill users vectorization factors to calculate shuffle cost correctly.
+  for (const EdgeInfo &EI : E->UserTreeIndices) {
+    if (!EI.UserTE || EI.UserTE == IE)
+      continue;
+    SmallSet<unsigned, 4> UserUserVFs;
+    if (unsigned UserVF = getEntryVF(EI.UserTE, UserUserVFs, IE)) {
+      UserVFs.insert(UserVF);
+      MinVF = std::max(std::min(MinVF, UserVF), SelfVF);
+    }
+  }
+  if (SelfVF <= 1 ||
+      (!IsGather && E->getNumOperands() < 1 && !UserVFs.contains(SelfVF)))
+    SelfVF = std::max<unsigned>(2, MinVF);
+  if (IsGather && SelfVF < MinVF)
+    SelfVF = MinVF;
+  EntryVFs[E] = SelfVF;
+  return SelfVF;
+}
+
 InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value*> VL = E->Scalars;
 
+  SmallSet<unsigned, 4> UserVFs;
+  // Original vectorization factor.
+  unsigned SelfVF = getEntryVF(E, UserVFs, E);
+  unsigned ShuffleVF = SelfVF;
+  // Final vectorization factor after shuffling reuses.
+  if (!E->ReuseShuffleIndices.empty()) {
+    int Limit = VL.size();
+    ShuffleVF = std::max<unsigned>(
+        SelfVF, PowerOf2Ceil(std::distance(
+                    E->ReuseShuffleIndices.begin(),
+                    find_if(reverse(E->ReuseShuffleIndices), [Limit](int I) {
+                      return I < Limit;
+                    }).base())));
+  }
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
+  const unsigned NumOfInstructions =
+      std::distance(InstructionsOnly.begin(), InstructionsOnly.end());
+  Value *V0;
   Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
-    ScalarTy = CI->getOperand(0)->getType();
-  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  FixedVectorType *VecTy;
+  FixedVectorType *FinalVecTy;
+  if (!llvm::empty(InstructionsOnly)) {
+    V0 = *InstructionsOnly.begin();
+    if (StoreInst *SI = dyn_cast<StoreInst>(V0))
+      ScalarTy = SI->getValueOperand()->getType();
+    else if (CmpInst *CI = dyn_cast<CmpInst>(V0))
+      ScalarTy = CI->getOperand(0)->getType();
+
+    // If we have computed a smaller type for the expression, update VecTy so
+    // that the costs will be accurate.
+    auto MinBWI = MinBWs.find(V0);
+    if (MinBWI != MinBWs.end()) {
+      VecTy = FixedVectorType::get(
+          IntegerType::get(F->getContext(), MinBWI->second.first), SelfVF);
+      FinalVecTy = FixedVectorType::get(
+          IntegerType::get(F->getContext(), MinBWI->second.first), ShuffleVF);
+    } else {
+      VecTy = FixedVectorType::get(ScalarTy, SelfVF);
+      FinalVecTy = FixedVectorType::get(ScalarTy, ShuffleVF);
+    }
+  } else {
+    VecTy = FixedVectorType::get(ScalarTy, SelfVF);
+    FinalVecTy = FixedVectorType::get(ScalarTy, ShuffleVF);
+  }
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
-  // If we have computed a smaller type for the expression, update VecTy so
-  // that the costs will be accurate.
-  if (MinBWs.count(VL[0]))
-    VecTy = FixedVectorType::get(
-        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
-
   unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   InstructionCost ReuseShuffleCost = 0;
   if (NeedToShuffleReuses) {
     ReuseShuffleCost =
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                            E->ReuseShuffleIndices);
+        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                            FinalVecTy, E->ReuseShuffleIndices);
+  }
+  for (unsigned UserVF : UserVFs) {
+    if (UserVF == ShuffleVF)
+      continue;
+    if (UserVF > ShuffleVF) {
+      ReuseShuffleCost +=
+          TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+                              FixedVectorType::get(ScalarTy, UserVF), None,
+                              /*Index=*/0, FinalVecTy);
+    } else {
+      ReuseShuffleCost += TTI->getShuffleCost(
+          TargetTransformInfo::SK_ExtractSubvector, FinalVecTy, None,
+          /*Index=*/0, FixedVectorType::get(ScalarTy, UserVF));
+    }
   }
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
@@ -3480,36 +4083,42 @@
              TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
                                  0);
     }
-    if (E->getOpcode() == Instruction::ExtractElement &&
-        allSameType(VL) && allSameBlock(VL)) {
+    if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
+        allSameBlock(InstructionsOnly)) {
       SmallVector<int> Mask;
       Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
-          isShuffle(VL, Mask);
-      if (ShuffleKind.hasValue()) {
+          NumOfInstructions > 1
+              ? isShuffle(llvm::to_vector<4>(InstructionsOnly), Mask)
+              : None;
+      if (NumOfInstructions == 1 || ShuffleKind) {
         InstructionCost Cost =
-            TTI->getShuffleCost(ShuffleKind.getValue(), VecTy, Mask);
-        for (auto *V : VL) {
+            NumOfInstructions > 1
+                ? TTI->getShuffleCost(*ShuffleKind, VecTy, Mask)
+                : 0;
+        for (Value *V : InstructionsOnly) {
           // If all users of instruction are going to be vectorized and this
           // instruction itself is not going to be vectorized, consider this
           // instruction as dead and remove its cost from the final cost of the
           // vectorized tree.
           if (areAllUsersVectorized(cast<Instruction>(V)) &&
               !ScalarToTreeEntry.count(V)) {
-            auto *IO = cast<ConstantInt>(
-                cast<ExtractElementInst>(V)->getIndexOperand());
-            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+            auto *EE = cast<ExtractElementInst>(V);
+            auto *IO = cast<ConstantInt>(EE->getIndexOperand());
+            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                            EE->getVectorOperandType(),
                                             IO->getZExtValue());
           }
         }
         return ReuseShuffleCost + Cost;
       }
     }
-    return ReuseShuffleCost + getGatherCost(VL);
+    return ReuseShuffleCost + getGatherCost(VL, SelfVF);
   }
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
          "Unhandled state");
-  assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+  assert(E->getOpcode() && allSameType(VL) && allSameBlock(InstructionsOnly) &&
+         "Invalid VL");
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -3525,12 +4134,14 @@
       if (NeedToShuffleReuses) {
         unsigned Idx = 0;
         for (unsigned I : E->ReuseShuffleIndices) {
+          if (I >= VL.size() || isa<UndefValue>(VL[I]))
+            continue;
           if (ShuffleOrOp == Instruction::ExtractElement) {
-            auto *IO = cast<ConstantInt>(
-                cast<ExtractElementInst>(VL[I])->getIndexOperand());
+            auto *EE = cast<ExtractElementInst>(VL[I]);
+            auto *IO = cast<ConstantInt>(EE->getIndexOperand());
             Idx = IO->getZExtValue();
             ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+                Instruction::ExtractElement, EE->getVectorOperandType(), Idx);
           } else {
             ReuseShuffleCost -= TTI->getVectorInstrCost(
                 Instruction::ExtractElement, VecTy, Idx);
@@ -3538,16 +4149,18 @@
           }
         }
         Idx = ReuseShuffleNumbers;
-        for (Value *V : VL) {
+        for (Value *V : InstructionsOnly) {
           if (ShuffleOrOp == Instruction::ExtractElement) {
-            auto *IO = cast<ConstantInt>(
-                cast<ExtractElementInst>(V)->getIndexOperand());
+            auto *EE = cast<ExtractElementInst>(V);
+            auto *IO = cast<ConstantInt>(EE->getIndexOperand());
             Idx = IO->getZExtValue();
+            ReuseShuffleCost += TTI->getVectorInstrCost(
+                Instruction::ExtractElement, EE->getVectorOperandType(), Idx);
           } else {
             --Idx;
+            ReuseShuffleCost += TTI->getVectorInstrCost(
+                Instruction::ExtractElement, VecTy, Idx);
           }
-          ReuseShuffleCost +=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
         }
         CommonCost = ReuseShuffleCost;
       } else if (!E->ReorderIndices.empty()) {
@@ -3556,29 +4169,44 @@
         CommonCost = TTI->getShuffleCost(
             TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
       }
+#ifndef NDEBUG
+      OrdersType CurrentOrder;
+      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+      assert(Reuse && E->ReorderIndices.empty() ||
+             (!Reuse && CurrentOrder.size() == E->ReorderIndices.size() &&
+              std::equal(CurrentOrder.begin(), CurrentOrder.end(),
+                         E->ReorderIndices.begin())) &&
+                 "The sequence of extract elements must be reused or shuffled "
+                 "with the same mask.");
+#endif
       for (unsigned I = 0, E = VL.size(); I < E; ++I) {
-        Instruction *EI = cast<Instruction>(VL[I]);
-        // If all users are going to be vectorized, instruction can be
-        // considered as dead.
-        // The same, if have only one user, it will be vectorized for sure.
-        if (areAllUsersVectorized(EI)) {
-          // Take credit for instruction that will become dead.
-          if (EI->hasOneUse()) {
-            Instruction *Ext = EI->user_back();
-            if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
-                all_of(Ext->users(),
-                       [](User *U) { return isa<GetElementPtrInst>(U); })) {
-              // Use getExtractWithExtendCost() to calculate the cost of
-              // extractelement/ext pair.
-              CommonCost -= TTI->getExtractWithExtendCost(
-                  Ext->getOpcode(), Ext->getType(), VecTy, I);
-              // Add back the cost of s|zext which is subtracted separately.
-              CommonCost += TTI->getCastInstrCost(
-                  Ext->getOpcode(), Ext->getType(), EI->getType(),
-                  TTI::getCastContextHint(Ext), CostKind, Ext);
-              continue;
-            }
+        if (isa<UndefValue>(VL[I]))
+          continue;
+        auto *EI = cast<Instruction>(VL[I]);
+        // Take credit for instruction that will become dead.
+        if (EI->hasOneUse()) {
+          Instruction *Ext = EI->user_back();
+          if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+              all_of(Ext->users(),
+                     [](User *U) { return isa<GetElementPtrInst>(U); })) {
+            // Use getExtractWithExtendCost() to calculate the cost of
+            // extractelement/ext pair.
+            CommonCost -= TTI->getExtractWithExtendCost(
+                Ext->getOpcode(), Ext->getType(), VecTy, I);
+            // Add back the cost of s|zext which is subtracted separately.
+            CommonCost += TTI->getCastInstrCost(
+                Ext->getOpcode(), Ext->getType(), EI->getType(),
+                TTI::getCastContextHint(Ext), CostKind, Ext);
+            continue;
           }
+        }
+        if (ShuffleOrOp == Instruction::ExtractElement) {
+          auto *EE = cast<ExtractElementInst>(EI);
+          auto *IO = cast<ConstantInt>(EE->getIndexOperand());
+          unsigned Idx = IO->getZExtValue();
+          CommonCost -= TTI->getVectorInstrCost(
+              Instruction::ExtractElement, EE->getVectorOperandType(), Idx);
+        } else {
           CommonCost -=
               TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
         }
@@ -3602,13 +4230,14 @@
           TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
                                 TTI::getCastContextHint(VL0), CostKind, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
       }
 
       // Calculate the cost of this instruction.
-      InstructionCost ScalarCost = VL.size() * ScalarEltCost;
+      InstructionCost ScalarCost = NumOfInstructions * ScalarEltCost;
 
-      auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
+      auto *SrcVecTy = FixedVectorType::get(SrcTy, SelfVF);
       InstructionCost VecCost = 0;
       // Check if the values are candidates to demote.
       if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
@@ -3628,10 +4257,11 @@
           TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
       }
-      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), SelfVF);
+      InstructionCost ScalarCost = NumOfInstructions * ScalarEltCost;
 
       // Check if all entries in VL are either compares or selects with compares
       // as condition that have the same predicates.
@@ -3707,24 +4337,27 @@
       // If instead not all operands are constants, then set the operand kind
       // to OK_AnyValue. If all operands are constants but not the same,
       // then set the operand kind to OK_NonUniformConstantValue.
-      ConstantInt *CInt0 = nullptr;
+      Constant *C0 = nullptr;
       for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        if (isa<UndefValue>(VL[i]))
+          continue;
         const Instruction *I = cast<Instruction>(VL[i]);
         unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
         ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
-        if (!CInt) {
+        Constant *UV = dyn_cast<UndefValue>(I->getOperand(OpIdx));
+        if (!CInt && !UV) {
           Op2VK = TargetTransformInfo::OK_AnyValue;
           Op2VP = TargetTransformInfo::OP_None;
           break;
         }
         if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
-            !CInt->getValue().isPowerOf2())
+            (UV || !cast<ConstantInt>(CInt)->getValue().isPowerOf2()))
           Op2VP = TargetTransformInfo::OP_None;
         if (i == 0) {
-          CInt0 = CInt;
+          C0 = CInt ? CInt : UV;
           continue;
         }
-        if (CInt0 != CInt)
+        if (C0 != (CInt ? CInt : UV))
           Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
       }
 
@@ -3733,9 +4366,10 @@
           TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
       }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      InstructionCost ScalarCost = NumOfInstructions * ScalarEltCost;
       InstructionCost VecCost =
           TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
@@ -3751,9 +4385,10 @@
       InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
           Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
       }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      InstructionCost ScalarCost = NumOfInstructions * ScalarEltCost;
       InstructionCost VecCost = TTI->getArithmeticInstrCost(
           Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
@@ -3765,20 +4400,75 @@
       InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
           Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
       }
-      InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+      InstructionCost ScalarLdCost = NumOfInstructions * ScalarEltCost;
+
       InstructionCost VecLdCost;
+      bool ShuffledLoadInstructions = false;
       if (E->State == TreeEntry::Vectorize) {
-        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
-                                         CostKind, VL0);
+        unsigned MinIdx;
+        unsigned MaxIdx;
+        if (E->ReorderIndices.empty()) {
+          MinIdx = std::distance(VL.begin(), find_if(VL, Instruction::classof));
+          MaxIdx =
+              std::distance(VL.begin(),
+                            find_if(reverse(VL), Instruction::classof).base()) -
+              1;
+        } else {
+          std::tie(MinIdx, MaxIdx) = findMinMaxPos(E->ReorderIndices);
+        }
+        Align CommonAlign;
+        if (E->ReorderIndices.empty())
+          CommonAlign = alignment;
+        else
+          CommonAlign =
+              cast<LoadInst>(VL[E->ReorderIndices[MinIdx]])->getAlign();
+        unsigned InstrDist = MaxIdx - MinIdx + 1;
+        unsigned Sz = DL->getTypeStoreSize(ScalarTy);
+        // Check if we can use load instead of masked load, i.e. we can directly
+        // load aligned data.
+        unsigned AlignedInstrDist = std::min(
+            PowerOf2Ceil(InstrDist), alignTo(InstrDist * Sz, CommonAlign) / Sz);
+        if (isPowerOf2_32(AlignedInstrDist)) {
+          CommonAlign =
+              commonAlignment(CommonAlign, CommonAlign.value() -
+                                               (AlignedInstrDist - InstrDist));
+          auto *LoadVecTy = VecTy;
+          if (AlignedInstrDist != SelfVF)
+            LoadVecTy = FixedVectorType::get(ScalarTy, AlignedInstrDist);
+          VecLdCost = TTI->getMemoryOpCost(Instruction::Load, LoadVecTy,
+                                           CommonAlign, 0, CostKind, VL0);
+          if (!NeedToShuffleReuses && AlignedInstrDist != SelfVF) {
+            VecLdCost += TTI->getShuffleCost(
+                TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+            ShuffledLoadInstructions = true;
+          }
+        } else {
+          VecLdCost = TTI->getMaskedMemoryOpCost(Instruction::Load, VecTy,
+                                                 alignment, 0, CostKind);
+        }
       } else {
         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+        Align CommonAlignment = alignment;
+        for (Value *V : InstructionsOnly)
+          CommonAlignment =
+              commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+        unsigned NormalizedSz = llvm::PowerOf2Ceil(NumOfInstructions);
         VecLdCost = TTI->getGatherScatterOpCost(
-            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, alignment, CostKind, VL0);
+            Instruction::Load, FixedVectorType::get(ScalarTy, NormalizedSz),
+            cast<LoadInst>(VL0)->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
+        // Cost of resizing the loaded elements to the size of the vector.
+        if (!NeedToShuffleReuses && NormalizedSz != SelfVF) {
+          VecLdCost = TTI->getShuffleCost(
+              TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+          ShuffledLoadInstructions = true;
+        }
       }
-      if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
+      if (!NeedToShuffleReuses && !E->ReorderIndices.empty() &&
+          !ShuffledLoadInstructions) {
         SmallVector<int> NewMask;
         inversePermutation(E->ReorderIndices, NewMask);
         VecLdCost += TTI->getShuffleCost(
@@ -3790,19 +4480,41 @@
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
       bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI =
-          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+      auto *SI = cast<StoreInst>(VL0);
       Align Alignment = SI->getAlign();
       InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
           Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
-      InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecStCost = TTI->getMemoryOpCost(
-          Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      if (IsReorder) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
+      InstructionCost ScalarStCost = NumOfInstructions * ScalarEltCost;
+      InstructionCost VecStCost;
+      unsigned MinIdx;
+      unsigned MaxIdx;
+      if (E->ReorderIndices.empty()) {
+        MinIdx = std::distance(VL.begin(), find_if(VL, Instruction::classof));
+        MaxIdx =
+            std::distance(VL.begin(),
+                          find_if(reverse(VL), Instruction::classof).base()) -
+            1;
+      } else {
+        std::tie(MinIdx, MaxIdx) = findMinMaxPos(E->ReorderIndices);
+      }
+      if (NumOfInstructions != SelfVF) {
+        VecStCost = TTI->getMaskedMemoryOpCost(Instruction::Store, VecTy,
+                                               Alignment, 0, CostKind);
+        if (IsReorder) {
+          SmallVector<int> NewMask;
+          inversePermutation(E->ReorderIndices, NewMask);
+          VecStCost += TTI->getShuffleCost(
+              TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
+        }
+      } else {
+        VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                         0, CostKind, VL0);
+        if (IsReorder) {
+          SmallVector<int> NewMask;
+          inversePermutation(E->ReorderIndices, NewMask);
+          VecStCost += TTI->getShuffleCost(
+              TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
+        }
       }
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
       return VecStCost - ScalarStCost;
@@ -3816,9 +4528,10 @@
       InstructionCost ScalarEltCost =
           TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        ReuseShuffleCost -=
+            (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost;
       }
-      InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+      InstructionCost ScalarCallCost = NumOfInstructions * ScalarEltCost;
 
       auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
       InstructionCost VecCallCost =
@@ -3840,16 +4553,18 @@
       InstructionCost ScalarCost = 0;
       if (NeedToShuffleReuses) {
         for (unsigned Idx : E->ReuseShuffleIndices) {
-          Instruction *I = cast<Instruction>(VL[Idx]);
+          if (Idx >= VL.size() || isa<UndefValue>(VL[Idx]))
+            continue;
+          auto *I = cast<Instruction>(VL[Idx]);
           ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
         }
-        for (Value *V : VL) {
+        for (Value *V : InstructionsOnly) {
           Instruction *I = cast<Instruction>(V);
           ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
         }
       }
-      for (Value *V : VL) {
-        Instruction *I = cast<Instruction>(V);
+      for (Value *V : InstructionsOnly) {
+        auto *I = cast<Instruction>(V);
         assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
         ScalarCost += TTI->getInstructionCost(I, CostKind);
       }
@@ -3863,8 +4578,8 @@
       } else {
         Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
         Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
-        auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
-        auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+        auto *Src0Ty = FixedVectorType::get(Src0SclTy, SelfVF);
+        auto *Src1Ty = FixedVectorType::get(Src1SclTy, SelfVF);
         VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
                                         TTI::CastContextHint::None, CostKind);
         VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
@@ -3873,6 +4588,10 @@
 
       SmallVector<int> Mask(E->Scalars.size());
       for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
+        if (isa<UndefValue>(E->Scalars[I])) {
+          Mask[I] = UndefMaskElem;
+          continue;
+        }
         auto *OpInst = cast<Instruction>(E->Scalars[I]);
         assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
         Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
@@ -3899,10 +4618,13 @@
   if (VectorizableTree.size() != 2)
     return false;
 
-  // Handle splat and all-constants stores.
+  // Handle splat, all-constants stores and extractelement stores.
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
-       isSplat(VectorizableTree[1]->Scalars)))
+       isSplat(VectorizableTree[1]->Scalars) ||
+       all_of(VectorizableTree[1]->Scalars, [](Value *V) {
+         return isa<UndefValue>(V) || isa<ExtractElementInst>(V);
+       })))
     return true;
 
   // Gathering cost would be too much for tiny trees.
@@ -4076,8 +4798,6 @@
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = *VectorizableTree[I].get();
 
@@ -4093,14 +4813,33 @@
     // their uses. Since such an approach results in fewer total entries,
     // existing heuristics based on tree size may yield different results.
     //
+    // Also, need to exclude the cost for gather nodes, gcreate for gathered
+    // loads. These loads are already gathered and no need to count them again,
+    // if we were unable to vectorize them.
     if (TE.State == TreeEntry::NeedToGather &&
         std::any_of(std::next(VectorizableTree.begin(), I + 1),
-                    VectorizableTree.end(),
+                    GatheredLoadsEntriesFirst >= 0
+                        ? std::next(VectorizableTree.begin(),
+                                    GatheredLoadsEntriesFirst)
+                        : VectorizableTree.end(),
                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
                       return EntryPtr->State == TreeEntry::NeedToGather &&
                              EntryPtr->isSame(TE.Scalars);
                     }))
       continue;
+    // Exclude cost of gather loads nodes which are not used.
+    if (GatheredLoadsEntriesFirst >= 0 &&
+        I >= static_cast<unsigned>(GatheredLoadsEntriesFirst) &&
+        TE.State == TreeEntry::NeedToGather) {
+      assert(all_of(TE.Scalars,
+                    [this](Value *V) {
+                      return (isa<LoadInst>(V) && MustGather.contains(V)) ||
+                             isa<Constant>(V) ||
+                             V->getType()->isPtrOrPtrVectorTy();
+                    }) &&
+             "Expected loads, pointers or constants only.");
+      continue;
+    }
 
     InstructionCost C = getEntryCost(&TE);
     Cost += C;
@@ -4123,6 +4862,21 @@
     if (EphValues.count(EU.User))
       continue;
 
+    // BundleWidth varies in the treee, need to get the VF for each tree node.
+    const TreeEntry *TE = getTreeEntry(EU.Scalar);
+    SmallSet<unsigned int, 4> UserVFs;
+    unsigned BundleWidth = getEntryVF(TE, UserVFs, TE);
+    if (!TE->ReuseShuffleIndices.empty()) {
+      int Limit = TE->ReuseShuffleIndices.size();
+      BundleWidth = std::max<unsigned>(
+          BundleWidth,
+          PowerOf2Ceil(std::distance(
+              TE->ReuseShuffleIndices.begin(),
+              find_if(reverse(TE->ReuseShuffleIndices), [Limit](int I) {
+                return I < Limit;
+              }).base())));
+    }
+
     // If we plan to rewrite the tree in a smaller type, we will need to sign
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
@@ -4162,7 +4916,8 @@
 
 InstructionCost
 BoUpSLP::getGatherCost(FixedVectorType *Ty,
-                       const DenseSet<unsigned> &ShuffledIndices) const {
+                       const DenseSet<unsigned> &ShuffledIndices,
+                       bool NeedShuffleCost) const {
   unsigned NumElts = Ty->getNumElements();
   APInt DemandedElts = APInt::getNullValue(NumElts);
   for (unsigned I = 0; I < NumElts; ++I)
@@ -4171,42 +4926,49 @@
   InstructionCost Cost =
       TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
                                     /*Extract*/ false);
-  if (!ShuffledIndices.empty())
+  if (NeedShuffleCost)
     Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
   return Cost;
 }
 
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
+                                       unsigned VF) const {
   // Find the type of the operands in VL.
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     ScalarTy = SI->getValueOperand()->getType();
-  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
   // Find the cost of inserting/extracting values from the vector.
   // Check if the same elements are inserted several times and count them as
   // shuffle candidates.
   DenseSet<unsigned> ShuffledElements;
   DenseSet<Value *> UniqueElements;
   // Iterate in reverse order to consider insert elements with the high cost.
-  for (unsigned I = VL.size(); I > 0; --I) {
+  bool NeedShuffleCost = false;
+  for (int I = VF; I > 0; --I) {
     unsigned Idx = I - 1;
-    if (!UniqueElements.insert(VL[Idx]).second)
+    if (isa<Constant>(VL[Idx])) {
+      // Ignore constant data elements.
       ShuffledElements.insert(Idx);
+      continue;
+    }
+    if (!UniqueElements.insert(VL[Idx]).second) {
+      ShuffledElements.insert(Idx);
+      NeedShuffleCost = true;
+    }
   }
-  return getGatherCost(VecTy, ShuffledElements);
+  return getGatherCost(VecTy, ShuffledElements, NeedShuffleCost);
 }
 
 // Perform operand reordering on the instructions in VL and return the reordered
 // operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right,
-                                             const DataLayout &DL,
-                                             ScalarEvolution &SE,
-                                             const BoUpSLP &R) {
+void BoUpSLP::reorderInputsAccordingToOpcode(
+    Instruction &VL0, ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
+    SmallVectorImpl<Value *> &Right, const DataLayout &DL, ScalarEvolution &SE,
+    const BoUpSLP &R) {
   if (VL.empty())
     return;
-  VLOperands Ops(VL, DL, SE, R);
+  VLOperands Ops(VL0, VL, DL, SE, R);
   // Reorder the operands in place.
   Ops.reorder();
   Left = Ops.getVL(0);
@@ -4214,11 +4976,14 @@
 }
 
 void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
+  auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof);
+  if (llvm::empty(InstructionsOnly))
+    return;
   // Get the basic block this bundle is in. All instructions in the bundle
   // should be in this block.
   auto *Front = E->getMainOp();
   auto *BB = Front->getParent();
-  assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
+  assert(llvm::all_of(InstructionsOnly, [=](Value *V) -> bool {
     auto *I = cast<Instruction>(V);
     return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
   }));
@@ -4231,8 +4996,8 @@
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
   if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
-        BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+    auto *Bundle = BlocksSchedules[BB]->getScheduleData(
+        E->isOneOf(*llvm::reverse(InstructionsOnly).begin()));
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
         if (Bundle->OpValue == Bundle->Inst)
@@ -4258,7 +5023,8 @@
   // we both exit early from buildTree_rec and that the bundle be out-of-order
   // (causing us to iterate all the way to the end of the block).
   if (!LastInst) {
-    SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
+    SmallPtrSet<Value *, 16> Bundle(InstructionsOnly.begin(),
+                                    InstructionsOnly.end());
     for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
       if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
         LastInst = &I;
@@ -4275,101 +5041,65 @@
 }
 
 Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
-  Value *Val0 =
-      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
-  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
-  Value *Vec = PoisonValue::get(VecTy);
-  unsigned InsIndex = 0;
-  for (Value *Val : VL) {
-    Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
+  // List of instructions/lanes from current block and/or the blocks which are
+  // part of the current loop. These instructions will be inserted at the end to
+  // make it possible to optimize loops and hoist invariant instructions out of
+  // the loops body with better chances for success.
+  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
+  SmallSet<int, 4> PostponedIndices;
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
+      if (Inst->getParent() == Builder.GetInsertBlock() &&
+          PostponedIndices.insert(I).second)
+        PostponedInsts.emplace_back(Inst, I);
+  }
+  if (Loop *L = LI->getLoopFor(Builder.GetInsertBlock())) {
+    for (int I = 0, E = VL.size(); I < E; ++I) {
+      if (auto *Inst = dyn_cast<Instruction>(VL[I]))
+        if (L->contains(Inst) && PostponedIndices.insert(I).second)
+          PostponedInsts.emplace_back(Inst, I);
+    }
+  }
+
+  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
+    // No need to insert undefs elements - exit.
+    if (isa<UndefValue>(V))
+      return Vec;
+    Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
     if (!InsElt)
-      continue;
+      return Vec;
     GatherSeq.insert(InsElt);
     CSEBlocks.insert(InsElt->getParent());
     // Add to our 'need-to-extract' list.
-    if (TreeEntry *Entry = getTreeEntry(Val)) {
+    if (TreeEntry *Entry = getTreeEntry(V)) {
       // Find which lane we need to extract.
-      unsigned FoundLane = std::distance(Entry->Scalars.begin(),
-                                         find(Entry->Scalars, Val));
+      unsigned FoundLane =
+          std::distance(Entry->Scalars.begin(), find(Entry->Scalars, V));
       assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
       if (!Entry->ReuseShuffleIndices.empty()) {
         FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
                                   find(Entry->ReuseShuffleIndices, FoundLane));
       }
-      ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
+      ExternalUses.emplace_back(V, InsElt, FoundLane);
     }
-  }
-
-  return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
-  InstructionsState S = getSameOpcode(VL);
-  if (S.getOpcode()) {
-    if (TreeEntry *E = getTreeEntry(S.OpValue)) {
-      if (E->isSame(VL)) {
-        Value *V = vectorizeTree(E);
-        if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
-          // Reshuffle to get only unique values.
-          // If some of the scalars are duplicated in the vectorization tree
-          // entry, we do not vectorize them but instead generate a mask for the
-          // reuses. But if there are several users of the same entry, they may
-          // have different vectorization factors. This is especially important
-          // for PHI nodes. In this case, we need to adapt the resulting
-          // instruction for the user vectorization factor and have to reshuffle
-          // it again to take only unique elements of the vector. Without this
-          // code the function incorrectly returns reduced vector instruction
-          // with the same elements, not with the unique ones.
-          // block:
-          // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
-          // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
-          // ... (use %2)
-          // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
-          // br %block
-          SmallVector<int, 4> UniqueIdxs;
-          SmallSet<int, 4> UsedIdxs;
-          int Pos = 0;
-          for (int Idx : E->ReuseShuffleIndices) {
-            if (UsedIdxs.insert(Idx).second)
-              UniqueIdxs.emplace_back(Pos);
-            ++Pos;
-          }
-          V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
-        }
-        return V;
-      }
-    }
-  }
+    return Vec;
+  };
 
-  // Check that every instruction appears once in this bundle.
-  SmallVector<int, 4> ReuseShuffleIndicies;
-  SmallVector<Value *, 4> UniqueValues;
-  if (VL.size() > 2) {
-    DenseMap<Value *, unsigned> UniquePositions;
-    for (Value *V : VL) {
-      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
-      ReuseShuffleIndicies.emplace_back(Res.first->second);
-      if (Res.second || isa<Constant>(V))
-        UniqueValues.emplace_back(V);
-    }
-    // Do not shuffle single element or if number of unique values is not power
-    // of 2.
-    if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
-        !llvm::isPowerOf2_32(UniqueValues.size()))
-      ReuseShuffleIndicies.clear();
-    else
-      VL = UniqueValues;
+  Value *Val0 =
+      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
+  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
+  Value *Vec = PoisonValue::get(VecTy);
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    if (PostponedIndices.contains(I))
+      continue;
+    Vec = CreateInsertElement(Vec, VL[I], I);
   }
+  // Append instructions, which are/may be part of the loop, in the end to make
+  // it possible to hoist non-loop-based instructions.
+  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
+    Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
 
-  Value *Vec = gather(VL);
-  if (!ReuseShuffleIndicies.empty()) {
-    Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
-    if (auto *I = dyn_cast<Instruction>(Vec)) {
-      GatherSeq.insert(I);
-      CSEBlocks.insert(I->getParent());
-    }
-  }
   return Vec;
 }
 
@@ -4377,11 +5107,13 @@
 /// Merges shuffle masks and emits final shuffle instruction, if required.
 class ShuffleInstructionBuilder {
   IRBuilderBase &Builder;
+  unsigned VF = 0;
   bool IsFinalized = false;
   SmallVector<int, 4> Mask;
 
 public:
-  ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {}
+  ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF)
+      : Builder(Builder), VF(VF) {}
 
   /// Adds a mask, inverting it before applying.
   void addInversedMask(ArrayRef<unsigned> SubMask) {
@@ -4408,8 +5140,9 @@
     SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
     int TermValue = std::min(Mask.size(), SubMask.size());
     for (int I = 0, E = SubMask.size(); I < E; ++I) {
-      if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) {
-        NewMask[I] = E;
+      if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+          Mask[SubMask[I]] >= TermValue) {
+        NewMask[I] = UndefMaskElem;
         continue;
       }
       NewMask[I] = Mask[SubMask[I]];
@@ -4419,7 +5152,15 @@
 
   Value *finalize(Value *V) {
     IsFinalized = true;
-    if (Mask.empty())
+    if (VF == cast<FixedVectorType>(V->getType())->getNumElements() &&
+        Mask.empty())
+      return V;
+    SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem);
+    std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0);
+    addMask(NormalizedMask);
+
+    if (VF == cast<FixedVectorType>(V->getType())->getNumElements() &&
+        isUniform(Mask))
       return V;
     return Builder.CreateShuffleVector(V, Mask, "shuffle");
   }
@@ -4431,19 +5172,146 @@
 };
 } // namespace
 
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, unsigned VF) {
+  InstructionsState S = getSameOpcode(VL);
+  if (S.getOpcode()) {
+    if (TreeEntry *E = getTreeEntry(S.OpValue))
+      if (VL.size() == E->Scalars.size() && E->isSame(VL)) {
+        Value *V = vectorizeTree(E);
+        if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
+          if (!E->ReuseShuffleIndices.empty()) {
+            // Reshuffle to get only unique values.
+            // If some of the scalars are duplicated in the vectorization tree
+            // entry, we do not vectorize them but instead generate a mask for
+            // the reuses. But if there are several users of the same entry,
+            // they may have different vectorization factors. This is especially
+            // important for PHI nodes. In this case, we need to adapt the
+            // resulting instruction for the user vectorization factor and have
+            // to reshuffle it again to take only unique elements of the vector.
+            // Without this code the function incorrectly returns reduced vector
+            // instruction with the same elements, not with the unique ones.
+
+            // block:
+            // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
+            // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
+            // ... (use %2)
+            // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
+            // br %block
+            SmallVector<int, 4> UniqueIdxs;
+            SmallSet<int, 4> UsedIdxs;
+            int Pos = 0;
+            int Sz = VL.size();
+            for (int Idx : E->ReuseShuffleIndices) {
+              if (Idx != Sz && UsedIdxs.insert(Idx).second)
+                UniqueIdxs.emplace_back(Pos);
+              ++Pos;
+            }
+            assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
+                                            "less than original vector size.");
+            UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
+            V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
+          } else {
+            assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
+                   "Expected vectorization factor less "
+                   "than original vector size.");
+            SmallVector<int, 4> UniformMask(VF, 0);
+            std::iota(UniformMask.begin(), UniformMask.end(), 0);
+            V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
+          }
+        }
+        return V;
+      }
+  }
+
+  // Check that every instruction appears once in this bundle.
+  SmallVector<int, 4> ReuseShuffleIndicies;
+  SmallVector<Value *, 4> UniqueValues;
+  if (VL.size() > 2) {
+    DenseMap<Value *, unsigned> UniquePositions;
+    unsigned NumValues =
+        std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) {
+                                    return !isa<UndefValue>(V);
+                                  }).base());
+    VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));
+    int UniqueVals = 0;
+    bool HasUndefs = false;
+    for (Value *V : VL.drop_back(VL.size() - VF)) {
+      if (isa<UndefValue>(V)) {
+        ReuseShuffleIndicies.emplace_back(UndefMaskElem);
+        HasUndefs = true;
+        continue;
+      }
+      if (isa<Constant>(V)) {
+        ReuseShuffleIndicies.emplace_back(UniqueValues.size());
+        UniqueValues.emplace_back(V);
+        continue;
+      }
+      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+      ReuseShuffleIndicies.emplace_back(Res.first->second);
+      if (Res.second) {
+        UniqueValues.emplace_back(V);
+        ++UniqueVals;
+      }
+    }
+    if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) {
+      // Emit pure splat vector.
+      ReuseShuffleIndicies.assign(VF, 0);
+    } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
+      ReuseShuffleIndicies.clear();
+      UniqueValues.clear();
+      UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
+    }
+    UniqueValues.append(VF - UniqueValues.size(),
+                        UndefValue::get(VL[0]->getType()));
+    VL = UniqueValues;
+  }
+
+  ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
+  Value *Vec = gather(VL);
+  if (!ReuseShuffleIndicies.empty()) {
+    ShuffleBuilder.addMask(ReuseShuffleIndicies);
+    Vec = ShuffleBuilder.finalize(Vec);
+    if (auto *I = dyn_cast<Instruction>(Vec)) {
+      GatherSeq.insert(I);
+      CSEBlocks.insert(I->getParent());
+    }
+  }
+  return Vec;
+}
+
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
+  Instruction *VL0 = E->getMainOp();
   if (E->VectorizedValue) {
-    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
     return E->VectorizedValue;
   }
 
-  ShuffleInstructionBuilder ShuffleBuilder(Builder);
+  SmallSet<unsigned, 4> UserVFs;
+  unsigned SelfVF = getEntryVF(E, UserVFs, E);
+  unsigned ShuffleVF = SelfVF;
+  if (!E->ReuseShuffleIndices.empty()) {
+    int Limit = E->Scalars.size();
+    ShuffleVF = std::max<unsigned>(
+        SelfVF, PowerOf2Ceil(std::distance(
+                    E->ReuseShuffleIndices.begin(),
+                    find_if(reverse(E->ReuseShuffleIndices), [Limit](int I) {
+                      return I < Limit;
+                    }).base())));
+  }
+  ShuffleInstructionBuilder ShuffleBuilder(Builder, ShuffleVF);
+  Type *ScalarTy = VL0->getType();
+  if (auto *Store = dyn_cast<StoreInst>(VL0))
+    ScalarTy = Store->getValueOperand()->getType();
+  auto *VecTy = FixedVectorType::get(ScalarTy, SelfVF);
+  if (isa<UndefValue>(VL0))
+    return UndefValue::get(VecTy);
+
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   if (E->State == TreeEntry::NeedToGather) {
     setInsertPointAfterBundle(E);
-    Value *Vec = gather(E->Scalars);
+    Value *Vec = gather(makeArrayRef(E->Scalars).slice(0, SelfVF));
     if (NeedToShuffleReuses) {
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       Vec = ShuffleBuilder.finalize(Vec);
@@ -4459,13 +5327,9 @@
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
          "Unhandled state");
+  auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof);
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
-  Instruction *VL0 = E->getMainOp();
-  Type *ScalarTy = VL0->getType();
-  if (auto *Store = dyn_cast<StoreInst>(VL0))
-    ScalarTy = Store->getValueOperand()->getType();
-  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       auto *PH = cast<PHINode>(VL0);
@@ -4473,9 +5337,8 @@
       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
       Value *V = NewPhi;
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
-
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
       E->VectorizedValue = V;
 
       // PHINodes may have multiple entries from the same block. We want to
@@ -4493,7 +5356,7 @@
 
         Builder.SetInsertPoint(IBB->getTerminator());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
-        Value *Vec = vectorizeTree(E->getOperand(i));
+        Value *Vec = vectorizeTree(E->getOperand(i), SelfVF);
         NewPhi->addIncoming(Vec, IBB);
       }
 
@@ -4512,12 +5375,12 @@
       return V;
     }
     case Instruction::ExtractValue: {
-      auto *LI = cast<LoadInst>(E->getSingleOperand(0));
+      auto *LI = cast<LoadInst>(VL0->getOperand(0));
       Builder.SetInsertPoint(LI);
       auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
       Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
       LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
-      Value *NewV = propagateMetadata(V, E->Scalars);
+      Value *NewV = propagateMetadata(V, to_vector<4>(InstructionsOnly));
       ShuffleBuilder.addInversedMask(E->ReorderIndices);
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       NewV = ShuffleBuilder.finalize(NewV);
@@ -4538,7 +5401,7 @@
     case Instruction::BitCast: {
       setInsertPointAfterBundle(E);
 
-      Value *InVec = vectorizeTree(E->getOperand(0));
+      Value *InVec = vectorizeTree(E->getOperand(0), SelfVF);
 
       if (E->VectorizedValue) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -4558,8 +5421,8 @@
     case Instruction::ICmp: {
       setInsertPointAfterBundle(E);
 
-      Value *L = vectorizeTree(E->getOperand(0));
-      Value *R = vectorizeTree(E->getOperand(1));
+      Value *L = vectorizeTree(E->getOperand(0), SelfVF);
+      Value *R = vectorizeTree(E->getOperand(1), SelfVF);
 
       if (E->VectorizedValue) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -4579,9 +5442,9 @@
     case Instruction::Select: {
       setInsertPointAfterBundle(E);
 
-      Value *Cond = vectorizeTree(E->getOperand(0));
-      Value *True = vectorizeTree(E->getOperand(1));
-      Value *False = vectorizeTree(E->getOperand(2));
+      Value *Cond = vectorizeTree(E->getOperand(0), SelfVF);
+      Value *True = vectorizeTree(E->getOperand(1), SelfVF);
+      Value *False = vectorizeTree(E->getOperand(2), SelfVF);
 
       if (E->VectorizedValue) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -4599,7 +5462,7 @@
     case Instruction::FNeg: {
       setInsertPointAfterBundle(E);
 
-      Value *Op = vectorizeTree(E->getOperand(0));
+      Value *Op = vectorizeTree(E->getOperand(0), SelfVF);
 
       if (E->VectorizedValue) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -4640,8 +5503,8 @@
     case Instruction::Xor: {
       setInsertPointAfterBundle(E);
 
-      Value *LHS = vectorizeTree(E->getOperand(0));
-      Value *RHS = vectorizeTree(E->getOperand(1));
+      Value *LHS = vectorizeTree(E->getOperand(0), SelfVF);
+      Value *RHS = vectorizeTree(E->getOperand(1), SelfVF);
 
       if (E->VectorizedValue) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -4653,7 +5516,7 @@
           RHS);
       propagateIRFlags(V, E->Scalars, VL0);
       if (auto *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+        V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly));
 
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       V = ShuffleBuilder.finalize(V);
@@ -4672,31 +5535,78 @@
       setInsertPointAfterBundle(E);
 
       LoadInst *LI = cast<LoadInst>(VL0);
-      Instruction *NewLI;
       unsigned AS = LI->getPointerAddressSpace();
       Value *PO = LI->getPointerOperand();
+      unsigned MinIdx;
+      unsigned MaxIdx;
+      if (E->ReorderIndices.empty()) {
+        MinIdx = std::distance(E->Scalars.begin(),
+                               find_if(E->Scalars, Instruction::classof));
+        MaxIdx =
+            std::distance(
+                E->Scalars.begin(),
+                find_if(reverse(E->Scalars), Instruction::classof).base()) -
+            1;
+      } else {
+        std::tie(MinIdx, MaxIdx) = findMinMaxPos(E->ReorderIndices);
+      }
+      unsigned NumOfInstructions = MaxIdx - MinIdx + 1;
+      Value *VecPtr;
+      Instruction *VecLI;
+      Value *V;
+      Align CommonAlignment = LI->getAlign();
       if (E->State == TreeEntry::Vectorize) {
-
-        Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
-
+        unsigned Sz = DL->getTypeStoreSize(ScalarTy);
+        unsigned AlignedNumOfInstructions =
+            std::min(PowerOf2Ceil(NumOfInstructions),
+                     alignTo(NumOfInstructions * Sz, CommonAlignment) / Sz);
+        if (isPowerOf2_32(AlignedNumOfInstructions)) {
+          CommonAlignment =
+              commonAlignment(CommonAlignment, CommonAlignment.value() -
+                                                   (AlignedNumOfInstructions -
+                                                    NumOfInstructions));
+          auto *LoadVecTy =
+              FixedVectorType::get(ScalarTy, AlignedNumOfInstructions);
+          VecPtr = Builder.CreateBitCast(PO, LoadVecTy->getPointerTo(AS));
+          VecLI = Builder.CreateAlignedLoad(LoadVecTy, VecPtr, CommonAlignment);
+          V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly));
+        } else {
+          VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
+          SmallVector<Constant *, 4> Mask;
+          Mask.reserve(SelfVF);
+          Mask.append(NumOfInstructions, Builder.getInt1(/*V=*/true));
+          Mask.append(SelfVF - NumOfInstructions, Builder.getInt1(/*V=*/false));
+          VecLI = Builder.CreateMaskedLoad(VecPtr, CommonAlignment,
+                                           ConstantVector::get(Mask));
+          V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly));
+        }
         // The pointer operand uses an in-tree scalar so we add the new BitCast
         // to ExternalUses list to make sure that an extract will be generated
         // in the future.
         if (getTreeEntry(PO))
           ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
-
-        NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
       } else {
         assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
-        Value *VecPtr = vectorizeTree(E->getOperand(0));
-        // Use the minimum alignment of the gathered loads.
-        Align CommonAlignment = LI->getAlign();
-        for (Value *V : E->Scalars)
+        for (Value *V : InstructionsOnly)
           CommonAlignment =
               commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
-        NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);
+        unsigned NormalizedSz = llvm::PowerOf2Ceil(NumOfInstructions);
+        Value *VecPtr = vectorizeTree(E->getOperand(0), SelfVF);
+        if (NormalizedSz != SelfVF) {
+          // Reduce the original vector to optimize masked gather.
+          SmallVector<int, 4> RedMask(NormalizedSz, 0);
+          std::iota(RedMask.begin(), RedMask.end(), 0);
+          VecPtr = Builder.CreateShuffleVector(VecPtr, RedMask);
+        }
+        SmallVector<Constant *, 4> Mask;
+        Mask.reserve(SelfVF);
+        Mask.append(NumOfInstructions, Builder.getInt1(/*V=*/true));
+        Mask.append(NormalizedSz - NumOfInstructions,
+                    Builder.getInt1(/*V=*/false));
+        VecLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment,
+                                           ConstantVector::get(Mask));
+        V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly));
       }
-      Value *V = propagateMetadata(NewLI, E->Scalars);
 
       ShuffleBuilder.addInversedMask(E->ReorderIndices);
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
@@ -4706,30 +5616,64 @@
       return V;
     }
     case Instruction::Store: {
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI = cast<StoreInst>(
-          IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+      bool IsReorder = E->updateStateIfReorder();
+      if (IsReorder)
+        VL0 = E->getMainOp();
+      auto *SI = cast<StoreInst>(VL0);
       unsigned AS = SI->getPointerAddressSpace();
 
       setInsertPointAfterBundle(E);
 
-      Value *VecValue = vectorizeTree(E->getOperand(0));
+      Value *VecValue =
+          vectorizeTree(E->getOperand(0),
+                        PowerOf2Ceil(std::distance(InstructionsOnly.begin(),
+                                                   InstructionsOnly.end())));
       ShuffleBuilder.addMask(E->ReorderIndices);
       VecValue = ShuffleBuilder.finalize(VecValue);
 
       Value *ScalarPtr = SI->getPointerOperand();
-      Value *VecPtr = Builder.CreateBitCast(
-          ScalarPtr, VecValue->getType()->getPointerTo(AS));
-      StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
-                                                 SI->getAlign());
+
+      Align Alignment = SI->getAlign();
+      unsigned MinIdx;
+      unsigned MaxIdx;
+      if (E->ReorderIndices.empty()) {
+        MinIdx = std::distance(E->Scalars.begin(),
+                               find_if(E->Scalars, Instruction::classof));
+        MaxIdx =
+            std::distance(
+                E->Scalars.begin(),
+                find_if(reverse(E->Scalars), Instruction::classof).base()) -
+            1;
+      } else {
+        std::tie(MinIdx, MaxIdx) = findMinMaxPos(E->ReorderIndices);
+      }
+      Value *VecPtr;
+      Instruction *VecSI;
+      if (std::distance(InstructionsOnly.begin(), InstructionsOnly.end()) ==
+          SelfVF) {
+        VecPtr = Builder.CreateBitCast(
+            ScalarPtr,
+            FixedVectorType::get(ScalarTy, SelfVF)->getPointerTo(AS));
+        VecSI = Builder.CreateAlignedStore(VecValue, VecPtr, Alignment);
+      } else {
+        VecPtr = Builder.CreateBitCast(ScalarPtr,
+                                       VecValue->getType()->getPointerTo(AS));
+        SmallVector<Constant *, 4> Mask(SelfVF, Builder.getInt1(/*V=*/false));
+        for (unsigned I = 0; I < SelfVF; ++I) {
+          if (E->ReorderIndices[I] != SelfVF)
+            Mask[I] = Builder.getInt1(/*V=*/true);
+        }
+        VecSI = Builder.CreateMaskedStore(VecValue, VecPtr, Alignment,
+                                          ConstantVector::get(Mask));
+      }
 
       // The pointer operand uses an in-tree scalar, so add the new BitCast to
       // ExternalUses to make sure that an extract will be generated in the
       // future.
       if (getTreeEntry(ScalarPtr))
-        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+        ExternalUses.emplace_back(ScalarPtr, cast<User>(VecPtr), 0);
 
-      Value *V = propagateMetadata(ST, E->Scalars);
+      Value *V = propagateMetadata(VecSI, llvm::to_vector<4>(InstructionsOnly));
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4738,7 +5682,7 @@
     case Instruction::GetElementPtr: {
       setInsertPointAfterBundle(E);
 
-      Value *Op0 = vectorizeTree(E->getOperand(0));
+      Value *Op0 = vectorizeTree(E->getOperand(0), SelfVF);
 
       std::vector<Value *> OpVecs;
       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
@@ -4754,18 +5698,20 @@
                                               ->getPointerOperandType()
                                               ->getScalarType());
         for (Value *&V : VL) {
+          if (isa<UndefValue>(V))
+            continue;
           auto *CI = cast<ConstantInt>(V);
           V = ConstantExpr::getIntegerCast(CI, Ty,
                                            CI->getValue().isSignBitSet());
         }
-        Value *OpVec = vectorizeTree(VL);
+        Value *OpVec = vectorizeTree(VL, SelfVF);
         OpVecs.push_back(OpVec);
       }
 
       Value *V = Builder.CreateGEP(
           cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
       if (Instruction *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+        V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly));
 
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       V = ShuffleBuilder.finalize(V);
@@ -4802,20 +5748,18 @@
           continue;
         }
 
-        Value *OpVec = vectorizeTree(E->getOperand(j));
+        Value *OpVec = vectorizeTree(E->getOperand(j), SelfVF);
         LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
         OpVecs.push_back(OpVec);
       }
 
       Function *CF;
       if (!UseIntrinsic) {
-        VFShape Shape =
-            VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
-                                  VecTy->getNumElements())),
-                         false /*HasGlobalPred*/);
+        VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(SelfVF),
+                                     false /*HasGlobalPred*/);
         CF = VFDatabase(*CI).getVectorizedFunction(Shape);
       } else {
-        Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+        Type *Tys[] = {FixedVectorType::get(CI->getType(), SelfVF)};
         CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
       }
 
@@ -4829,7 +5773,7 @@
       if (ScalarArg && getTreeEntry(ScalarArg))
         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
 
-      propagateIRFlags(V, E->Scalars, VL0);
+      propagateIRFlags(V, to_vector<4>(InstructionsOnly), VL0);
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       V = ShuffleBuilder.finalize(V);
 
@@ -4848,11 +5792,11 @@
       Value *LHS = nullptr, *RHS = nullptr;
       if (Instruction::isBinaryOp(E->getOpcode())) {
         setInsertPointAfterBundle(E);
-        LHS = vectorizeTree(E->getOperand(0));
-        RHS = vectorizeTree(E->getOperand(1));
+        LHS = vectorizeTree(E->getOperand(0), SelfVF);
+        RHS = vectorizeTree(E->getOperand(1), SelfVF);
       } else {
         setInsertPointAfterBundle(E);
-        LHS = vectorizeTree(E->getOperand(0));
+        LHS = vectorizeTree(E->getOperand(0), SelfVF);
       }
 
       if (E->VectorizedValue) {
@@ -4877,13 +5821,17 @@
       // Also, gather up main and alt scalar ops to propagate IR flags to
       // each vector operation.
       ValueList OpScalars, AltScalars;
-      unsigned e = E->Scalars.size();
-      SmallVector<int, 8> Mask(e);
-      for (unsigned i = 0; i < e; ++i) {
+      SmallVector<int, 8> Mask(SelfVF);
+      for (unsigned i = 0; i < SelfVF; ++i) {
+        if (isa<UndefValue>(E->Scalars[i])) {
+          Mask[i] = i;
+          OpScalars.push_back(E->Scalars[i]);
+          continue;
+        }
         auto *OpInst = cast<Instruction>(E->Scalars[i]);
         assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
         if (OpInst->getOpcode() == E->getAltOpcode()) {
-          Mask[i] = e + i;
+          Mask[i] = SelfVF + i;
           AltScalars.push_back(E->Scalars[i]);
         } else {
           Mask[i] = i;
@@ -4896,7 +5844,7 @@
 
       Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
       if (Instruction *I = dyn_cast<Instruction>(V))
-        V = propagateMetadata(I, E->Scalars);
+        V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly));
       ShuffleBuilder.addMask(E->ReuseShuffleIndices);
       V = ShuffleBuilder.finalize(V);
 
@@ -4974,6 +5922,9 @@
            "Extracting from a gather list");
 
     Value *Vec = E->VectorizedValue;
+    if (!Vec && E->getOpcode() == Instruction::Load &&
+        E->UserTreeIndices.empty() && E != VectorizableTree[0].get())
+      Vec = vectorizeTree(E);
     assert(Vec && "Can't find vectorizable value");
 
     Value *Lane = Builder.getInt32(ExternalUse.Lane);
@@ -5052,6 +6003,8 @@
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
+      if (isa<UndefValue>(Scalar))
+        continue;
 
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
@@ -5211,9 +6164,10 @@
     }
   };
 
+  auto InstructionsOnly = make_filter_range(VL, Instruction::classof);
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
-  for (Value *V : VL) {
+  for (Value *V : InstructionsOnly) {
     if (!extendSchedulingRegion(V, S)) {
       // If the scheduling region got new instructions at the lower end (or it
       // is a new region for the first bundle). This makes it necessary to
@@ -5226,7 +6180,7 @@
     }
   }
 
-  for (Value *V : VL) {
+  for (Value *V : InstructionsOnly) {
     ScheduleData *BundleMember = getScheduleData(V);
     assert(BundleMember &&
            "no ScheduleData for bundle member (maybe not in same basic block)");
@@ -5270,7 +6224,9 @@
   LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
-  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+  assert(Bundle->isSchedulingEntity() &&
+         (Bundle->isPartOfBundle() ||
+          llvm::count_if(VL, Instruction::classof) == 1) &&
          "tried to unbundle something which is not a bundle");
 
   // Un-bundle: make single instructions out of the bundle.
@@ -5576,7 +6532,9 @@
        I = I->getNextNode()) {
     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
       assert(SD->isPartOfBundle() ==
-                 (getTreeEntry(SD->Inst) != nullptr) &&
+                 (getTreeEntry(SD->Inst) != nullptr &&
+                  llvm::count_if(getTreeEntry(SD->Inst)->Scalars,
+                                 Instruction::classof) > 1) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
       if (SD->isSchedulingEntity()) {
@@ -5681,8 +6639,13 @@
   // If we didn't encounter a memory access in the expression tree, or if we
   // gave up for some reason, just return the width of V. Otherwise, return the
   // maximum width we found.
-  if (!MaxWidth || FoundUnknownInst)
-    Width = DL->getTypeSizeInBits(V->getType());
+  if (!MaxWidth || FoundUnknownInst) {
+    // For cmp instructions use the size of its operands, not the size of i1
+    // type.
+    if (auto *CI = dyn_cast<CmpInst>(V))
+      V = CI->getOperand(0);
+    Width = std::max<int>(Width, DL->getTypeSizeInBits(V->getType()));
+  }
 
   for (Instruction *I : Visited)
     InstrElementSize[I] = Width;
@@ -6044,17 +7007,58 @@
   return Changed;
 }
 
+/// Order may have elements assigned special value (size) which is out of
+/// bounds. Such indices only appear on places which correspond to undef values
+/// (see canReuseExtract for details) and used in order to avoid undef values
+/// have effect on operands ordering.
+/// The first loop below simply finds all unused indices and then the next loop
+/// nest assigns these indecies for undef values positions.
+/// As an example below Order has two undef positions and they have assigned
+/// values 3 and 7 respectively:
+/// before:  6 9 5 4 9 2 1 0
+/// after:   6 3 5 4 7 2 1 0
+static void fixupOrderingIndicies(SmallVectorImpl<unsigned> &Order) {
+  const unsigned Sz = Order.size();
+  SmallBitVector UsedIndices(Sz);
+  const unsigned BoundVal = Sz;
+  for (unsigned I : Order)
+    if (I != BoundVal)
+      UsedIndices[I] = true;
+  unsigned Idx = 0;
+  for (unsigned &I : Order) {
+    if (I == BoundVal) {
+      // Find first non-used index.
+      for (; Idx != Sz; ++Idx)
+        if (!UsedIndices[Idx])
+          break;
+      // Set correct index.
+      I = Idx;
+      ++Idx;
+    }
+  }
+}
+
 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                             unsigned Idx) {
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                     << "\n");
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
-  const unsigned MinVF = R.getMinVecRegSize() / Sz;
   unsigned VF = Chain.size();
 
-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
+  if (!isPowerOf2_32(Sz) || VF < 2)
     return false;
 
+  const unsigned MinVF = R.getMinVecRegSize() / Sz;
+  SmallVector<Value *, 8> FixedChain;
+  unsigned NewSize = PowerOf2Ceil(std::max(VF, MinVF));
+  if (NewSize != VF) {
+    FixedChain.reserve(NewSize);
+    FixedChain.append(Chain.begin(), Chain.end());
+    FixedChain.append(NewSize - Chain.size(),
+                      UndefValue::get(Chain[0]->getType()));
+    Chain = FixedChain;
+    VF = NewSize;
+  }
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
 
@@ -6062,10 +7066,12 @@
   Optional<ArrayRef<unsigned>> Order = R.bestOrder();
   // TODO: Handle orders of size less than number of elements in the vector.
   if (Order && Order->size() == Chain.size()) {
+    SmallVector<unsigned, 4> NewOrder(Order->begin(), Order->end());
+    fixupOrderingIndicies(NewOrder);
     // TODO: reorder tree nodes without tree rebuilding.
     SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
-    llvm::transform(*Order, ReorderedOps.begin(),
-                    [Chain](const unsigned Idx) { return Chain[Idx]; });
+    transform(NewOrder, ReorderedOps.begin(),
+              [Chain](const unsigned Idx) { return Chain[Idx]; });
     R.buildTree(ReorderedOps);
   }
   if (R.isTreeTinyAndNotFullyVectorizable())
@@ -6106,6 +7112,10 @@
   int E = Stores.size();
   SmallBitVector Tails(E, false);
   int MaxIter = MaxStoreLookup.getValue();
+  unsigned MaxVecRegSize = R.getMaxVecRegSize();
+  unsigned EltSize = R.getVectorElementSize(Stores.front());
+
+  unsigned MaxElts = PowerOf2Floor(MaxVecRegSize / EltSize);
   SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
       E, std::make_pair(E, INT_MAX));
   SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
@@ -6156,6 +7166,20 @@
         break;
   }
 
+  // Check if we allow masked stores.
+  unsigned MinVF =
+      std::max<unsigned>(2U, PowerOf2Ceil(R.getMinVecRegSize() / EltSize));
+  unsigned MaxVF =
+      std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+  SmallBitVector MaskedStoresSupported(std::max<int>(MaxVF, MinVF) + 1, false);
+  for (unsigned I = MinVF; I <= MaxVF; I *= 2) {
+    if (TTI->isLegalMaskedStore(
+            FixedVectorType::get(Stores.front()->getValueOperand()->getType(),
+                                 I),
+            cast<StoreInst>(Stores.front())->getAlign()))
+      MaskedStoresSupported.set(I);
+  }
+
   // For stores that start but don't end a link in the chain:
   for (int Cnt = E; Cnt > 0; --Cnt) {
     int I = Cnt - 1;
@@ -6168,7 +7192,12 @@
     while (I != E && !VectorizedStores.count(Stores[I])) {
       Operands.push_back(Stores[I]);
       Tails.set(I);
-      if (ConsecutiveChain[I].second != 1) {
+      int VF = std::min(
+          MaxVF, std::max<unsigned>(MinVF, PowerOf2Ceil(Operands.size())));
+      if (((!MaskedStoresSupported.test(VF) ||
+            Operands.size() < MinNonPow2StoresSize.getValue()) &&
+           ConsecutiveChain[I].second != 1) ||
+          ConsecutiveChain[I].second >= static_cast<int>(MaxVF)) {
         // Mark the new end in the chain and go back, if required. It might be
         // required if the original stores come in reversed order, for example.
         if (ConsecutiveChain[I].first != E &&
@@ -6185,37 +7214,43 @@
     }
     assert(!Operands.empty() && "Expected non-empty list of stores.");
 
-    unsigned MaxVecRegSize = R.getMaxVecRegSize();
-    unsigned EltSize = R.getVectorElementSize(Operands[0]);
-    unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
-
-    unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize);
-    unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
-                              MaxElts);
-
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
     unsigned StartIdx = 0;
-    for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
-      for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-        ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
-        if (!VectorizedStores.count(Slice.front()) &&
-            !VectorizedStores.count(Slice.back()) &&
-            vectorizeStoreChain(Slice, R, Cnt)) {
-          // Mark the vectorized stores so that we don't vectorize them again.
-          VectorizedStores.insert(Slice.begin(), Slice.end());
-          Changed = true;
-          // If we vectorized initial block, no need to try to vectorize it
-          // again.
-          if (Cnt == StartIdx)
-            StartIdx += Size;
-          Cnt += Size;
-          continue;
+    unsigned E = Operands.size();
+    unsigned StartSize =
+        std::min(MaxVF, std::max<unsigned>(MinVF, PowerOf2Ceil(E)));
+    for (unsigned Size = StartSize; Size >= 2; Size /= 2) {
+      bool IsLegalMaskedStores =
+          MaskedStoresSupported.test(std::max(MinVF, Size));
+      if (!IsLegalMaskedStores && Size < MinVF)
+        continue;
+      for (unsigned Cnt = StartIdx; Cnt + 1 + Size / 2 <= E;) {
+        unsigned NumStores = std::min(Size, E - Cnt);
+        // Try vectorization only if it is legal.
+        if ((IsLegalMaskedStores &&
+             NumStores >= MinNonPow2ValuesSize.getValue()) ||
+            (NumStores >= MinVF && isPowerOf2_32(NumStores))) {
+          ArrayRef<Value *> Slice =
+              makeArrayRef(Operands).slice(Cnt, NumStores);
+          if (!VectorizedStores.count(Slice.front()) &&
+              !VectorizedStores.count(Slice.back()) &&
+              vectorizeStoreChain(Slice, R, Cnt)) {
+            // Mark the vectorized stores so that we don't vectorize them again.
+            VectorizedStores.insert(Slice.begin(), Slice.end());
+            Changed = true;
+            // If we vectorized initial block, no need to try to vectorize it
+            // again.
+            if (Cnt == StartIdx)
+              StartIdx += Size;
+            Cnt += Size;
+            continue;
+          }
         }
         ++Cnt;
       }
       // Check if the whole array was vectorized already - exit.
-      if (StartIdx >= Operands.size())
+      if (StartIdx >= E)
         break;
     }
   }
@@ -6300,9 +7335,13 @@
     }
   }
 
+  unsigned NumElts = VL.size();
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
-  unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+  unsigned MaxVF = std::max<unsigned>(NumElts >= MinNonPow2ValuesSize.getValue()
+                                          ? PowerOf2Ceil(NumElts)
+                                          : PowerOf2Floor(NumElts),
+                                      MinVF);
   MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   if (MaxVF < 2) {
     R.getORE()->emit([&]() {
@@ -6312,19 +7351,26 @@
     });
     return false;
   }
-
   bool Changed = false;
   bool CandidateFound = false;
   InstructionCost MinCost = SLPCostThreshold.getValue();
 
   bool CompensateUseCost =
       !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
-        return V && isa<InsertElementInst>(V);
+        return isa_and_nonnull<InsertElementInst>(V);
       });
+  SmallVector<Value *, 4> NormalizedVL;
+  if (!CompensateUseCost && MaxVF > VL.size()) {
+    NormalizedVL.append(VL.begin(), VL.end());
+    NormalizedVL.append(MaxVF - VL.size(), UndefValue::get(I0->getType()));
+    VL = NormalizedVL;
+  }
+
   assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
          "Each scalar expected to have an associated InsertElement user.");
 
-  unsigned NextInst = 0, MaxInst = VL.size();
+  unsigned NextInst = 0, MaxInst = NumElts;
+  bool Width3Tried = MaxVF < 4;
   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
     // No actual vectorization should happen, if number of parts is the same as
     // provided vectorization factor (i.e. the scalar type is used for vector
@@ -6332,15 +7378,24 @@
     auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
     if (TTI->getNumberOfParts(VecTy) == VF)
       continue;
+    int Width = VF;
+    // Try the vectorization factor 4 once again if tried VF 4 already, but try
+    // to vectorize bundles of 3 elements. Try VF 2 after bundles size 3.
+    if (VF == 2 && !Width3Tried) {
+      VF = 4;
+      Width = 3;
+      Width3Tried = true;
+    }
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned OpsWidth = 0;
 
-      if (I + VF > MaxInst)
+      if (I + Width > MaxInst)
         OpsWidth = MaxInst - I;
       else
-        OpsWidth = VF;
+        OpsWidth = Width;
 
-      if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+      if ((Width == 3 && OpsWidth != 3) || (VF > MinVF && OpsWidth <= VF / 2) ||
+          (VF == MinVF && OpsWidth < 2))
         break;
 
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
@@ -6353,19 +7408,30 @@
 
       LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                         << "\n");
+      SmallVector<Value *, 8> FixedChain;
+      if (OpsWidth != VF) {
+        unsigned NewSize = VF;
+        FixedChain.reserve(NewSize);
+        FixedChain.append(Ops.begin(), Ops.end());
+        FixedChain.append(NewSize - Ops.size(),
+                          UndefValue::get(Ops[0]->getType()));
+        Ops = FixedChain;
+      }
+      assert(Ops.size() == VF &&
+             "Operations must have same size as vectorization factor.");
 
       R.buildTree(Ops);
-      Optional<ArrayRef<unsigned>> Order = R.bestOrder();
-      // TODO: check if we can allow reordering for more cases.
-      if (AllowReorder && Order) {
-        // TODO: reorder tree nodes without tree rebuilding.
-        // Conceptually, there is nothing actually preventing us from trying to
-        // reorder a larger list. In fact, we do exactly this when vectorizing
-        // reductions. However, at this point, we only expect to get here when
-        // there are exactly two operations.
-        assert(Ops.size() == 2);
-        Value *ReorderedOps[] = {Ops[1], Ops[0]};
-        R.buildTree(ReorderedOps, None);
+      if (AllowReorder) {
+        Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+        if (Order) {
+          // TODO: reorder tree nodes without tree rebuilding.
+          SmallVector<unsigned, 4> NewOrder(Order->begin(), Order->end());
+          fixupOrderingIndicies(NewOrder);
+          SmallVector<Value *, 4> ReorderedOps(Ops.size());
+          transform(NewOrder, ReorderedOps.begin(),
+                    [Ops](const unsigned Idx) { return Ops[Idx]; });
+          R.buildTree(ReorderedOps);
+        }
       }
       if (R.isTreeTinyAndNotFullyVectorizable())
         continue;
@@ -6923,12 +7989,12 @@
   }
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
-  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
-    // If there are a sufficient number of reduction values, reduce
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, const DataLayout &DL) {
+    // If there are a sufficient number of reduction values, extend
     // to a nearby power-of-2. We can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
     unsigned NumReducedVals = ReducedVals.size();
-    if (NumReducedVals < 4)
+    if (NumReducedVals < 3)
       return false;
 
     // Intersect the fast-math-flags from all reduction operations.
@@ -6966,9 +8032,9 @@
     // The reduction root is used as the insertion point for new instructions,
     // so set it as externally used to prevent it from being deleted.
     ExternallyUsedValues[ReductionRoot];
-    SmallVector<Value *, 16> IgnoreList;
+    SmallVector<Value *, 16> PostoponedIndicies;
     for (ReductionOpsType &RdxOp : ReductionOps)
-      IgnoreList.append(RdxOp.begin(), RdxOp.end());
+      PostoponedIndicies.append(RdxOp.begin(), RdxOp.end());
 
     unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
     if (NumReducedVals > ReduxWidth) {
@@ -7001,34 +8067,63 @@
 
     Value *VectorizedTree = nullptr;
     unsigned i = 0;
-    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
-      ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, ExternallyUsedValues, IgnoreList);
+    ReduxWidth = PowerOf2Ceil(NumReducedVals);
+    // Try once the non-power-2 vectorization and only if it is unsuccessfull,
+    // try to split it for less power-2 chunks.
+    while (
+        (ReduxWidth > NumReducedVals || i < NumReducedVals - ReduxWidth + 1) &&
+        ReduxWidth > 2) {
+      ArrayRef<Value *> VL;
+      SmallVector<Value *, 4> NormalizedVL;
+      // Still need to normalize to power-of-2 size.
+      if (ReduxWidth > NumReducedVals) {
+        NormalizedVL.append(&ReducedVals[i],
+                            &ReducedVals[i] + ReducedVals.size() - i);
+        NormalizedVL.append(ReduxWidth - NormalizedVL.size(),
+                            UndefValue::get(ReducedVals[i]->getType()));
+        VL = NormalizedVL;
+      } else {
+        VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+      }
+      V.buildTree(VL, ExternallyUsedValues, PostoponedIndicies);
       Optional<ArrayRef<unsigned>> Order = V.bestOrder();
       if (Order) {
         assert(Order->size() == VL.size() &&
                "Order size must be the same as number of vectorized "
                "instructions.");
+        SmallVector<unsigned, 4> NewOrder(Order->begin(), Order->end());
+        fixupOrderingIndicies(NewOrder);
         // TODO: reorder tree nodes without tree rebuilding.
         SmallVector<Value *, 4> ReorderedOps(VL.size());
-        llvm::transform(*Order, ReorderedOps.begin(),
+        llvm::transform(NewOrder, ReorderedOps.begin(),
                         [VL](const unsigned Idx) { return VL[Idx]; });
-        V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+        V.buildTree(ReorderedOps, ExternallyUsedValues, PostoponedIndicies);
       }
-      if (V.isTreeTinyAndNotFullyVectorizable())
-        break;
-      if (V.isLoadCombineReductionCandidate(RdxKind))
+      if (V.isTreeTinyAndNotFullyVectorizable() ||
+          V.isLoadCombineReductionCandidate(RdxKind)) {
+        // Try with smaller reductions.
+        if (ReduxWidth > NumReducedVals) {
+          ReduxWidth /= 2;
+          continue;
+        }
         break;
+      }
 
       V.computeMinimumValueSizes();
 
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost();
-      InstructionCost ReductionCost =
-          getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+      InstructionCost ReductionCost = getReductionCost(
+          TTI, ReducedVals[i],
+          ReduxWidth > NumReducedVals ? NumReducedVals : VL.size(), ReduxWidth);
       InstructionCost Cost = TreeCost + ReductionCost;
       if (!Cost.isValid()) {
         LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+        // Try with smaller reductions.
+        if (ReduxWidth > NumReducedVals) {
+          ReduxWidth /= 2;
+          continue;
+        }
         return false;
       }
       if (Cost >= -SLPCostThreshold) {
@@ -7040,6 +8135,11 @@
                  << " and threshold "
                  << ore::NV("Threshold", -SLPCostThreshold);
         });
+        // Try with smaller reductions.
+        if (ReduxWidth > NumReducedVals) {
+          ReduxWidth /= 2;
+          continue;
+        }
         break;
       }
 
@@ -7065,6 +8165,85 @@
       else
         Builder.SetInsertPoint(RdxRootInst);
 
+      // Check if we reduced non-power-2 number of elements and need to extend
+      // the scalars with the elements that does not affect the result (0 for
+      // add, or, xor, 1 for mul, ~0 for and, min for max and max for min).
+      if (ReduxWidth > NumReducedVals) {
+        Value *ShuffleOp = nullptr;
+        Type *ScalarTy = ReducedVals[i]->getType();
+        switch (RdxKind) {
+        case RecurKind::Add:
+        case RecurKind::Or:
+        case RecurKind::FAdd:
+        case RecurKind::Xor:
+          ShuffleOp =
+              ConstantVector::getSplat(ElementCount::getFixed(ReduxWidth),
+                                       Constant::getNullValue(ScalarTy));
+          break;
+        case RecurKind::And:
+          ShuffleOp =
+              ConstantVector::getSplat(ElementCount::getFixed(ReduxWidth),
+                                       Constant::getAllOnesValue(ScalarTy));
+          break;
+        case RecurKind::Mul:
+          ShuffleOp =
+              ConstantVector::getSplat(ElementCount::getFixed(ReduxWidth),
+                                       ConstantInt::get(ScalarTy, 1));
+          break;
+        case RecurKind::FMul:
+          ShuffleOp =
+              ConstantVector::getSplat(ElementCount::getFixed(ReduxWidth),
+                                       ConstantFP::get(ScalarTy, 1.0));
+          break;
+        case RecurKind::UMax:
+          ShuffleOp = ConstantVector::getSplat(
+              ElementCount::getFixed(ReduxWidth),
+              ConstantInt::get(ScalarTy, APInt::getMinValue(
+                                             DL.getTypeSizeInBits(ScalarTy))));
+          break;
+        case RecurKind::SMax:
+          ShuffleOp = ConstantVector::getSplat(
+              ElementCount::getFixed(ReduxWidth),
+              ConstantInt::get(ScalarTy, APInt::getSignedMinValue(
+                                             DL.getTypeSizeInBits(ScalarTy))));
+          break;
+        case RecurKind::UMin:
+          ShuffleOp = ConstantVector::getSplat(
+              ElementCount::getFixed(ReduxWidth),
+              ConstantInt::get(ScalarTy, APInt::getMaxValue(
+                                             DL.getTypeSizeInBits(ScalarTy))));
+          break;
+        case RecurKind::SMin:
+          ShuffleOp = ConstantVector::getSplat(
+              ElementCount::getFixed(ReduxWidth),
+              ConstantInt::get(ScalarTy, APInt::getSignedMaxValue(
+                                             DL.getTypeSizeInBits(ScalarTy))));
+          break;
+        case RecurKind::FMax:
+          ShuffleOp = ConstantVector::getSplat(
+              ElementCount::getFixed(ReduxWidth),
+              ConstantFP::get(ScalarTy,
+                              APFloat::getLargest(ScalarTy->getFltSemantics(),
+                                                  /*Negative=*/true)));
+          break;
+        case RecurKind::FMin:
+          ShuffleOp = ConstantVector::getSplat(
+              ElementCount::getFixed(ReduxWidth),
+              ConstantFP::get(ScalarTy,
+                              APFloat::getLargest(ScalarTy->getFltSemantics(),
+                                                  /*Negative=*/false)));
+          break;
+        default:
+          llvm_unreachable(
+              "Expected arithmetic or min/max reduction operation");
+        }
+        SmallVector<int, 4> Mask(ReduxWidth);
+        std::iota(Mask.begin(), Mask.begin() + NumReducedVals, 0);
+        std::iota(Mask.begin() + NumReducedVals, Mask.end(), ReduxWidth);
+        VectorizedRoot = Builder.CreateShuffleVector(
+            VectorizedRoot, ShuffleOp, Mask, "reduction.normalization");
+      }
+
       Value *ReducedSubTree =
           emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
 
@@ -7078,7 +8257,10 @@
                                   ReducedSubTree, "op.rdx", ReductionOps);
       }
       i += ReduxWidth;
-      ReduxWidth = PowerOf2Floor(NumReducedVals - i);
+      if (ReduxWidth > NumReducedVals)
+        ReduxWidth /= 2;
+      else
+        ReduxWidth = PowerOf2Floor(NumReducedVals - i);
     }
 
     if (VectorizedTree) {
@@ -7113,18 +8295,28 @@
 
       // Mark all scalar reduction ops for deletion, they are replaced by the
       // vector reductions.
-      V.eraseInstructions(IgnoreList);
+      V.eraseInstructions(PostoponedIndicies);
     }
     return VectorizedTree != nullptr;
   }
 
+  /// Extracts extra argument values to the vector to try to use them as
+  /// the vectorization roots.
+  SmallVector<Value *, 4> getCopyOfExtraArgValues() const {
+    SmallVector<Value *, 4> Args(ExtraArgs.size());
+    transform(
+        ExtraArgs, Args.begin(),
+        [](const std::pair<Instruction *, Value *> &P) { return P.second; });
+    return Args;
+  }
+
   unsigned numReductionValues() const { return ReducedVals.size(); }
 
 private:
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    Value *FirstReducedVal,
-                                   unsigned ReduxWidth) {
+                                   unsigned NumOfScalars, unsigned ReduxWidth) {
     Type *ScalarTy = FirstReducedVal->getType();
     FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
     InstructionCost VectorCost, ScalarCost;
@@ -7175,7 +8367,12 @@
     }
 
     // Scalar cost is repeated for N-1 elements.
-    ScalarCost *= (ReduxWidth - 1);
+    ScalarCost *= (NumOfScalars - 1);
+    // Need to reshuffle elements to replace undefs with the real constant
+    // values.
+    if (NumOfScalars != ReduxWidth)
+      VectorCost +=
+          TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VectorTy);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                       << " for reduction that starts with " << *FirstReducedVal
                       << " (It is a splitting reduction)\n");
@@ -7327,10 +8524,6 @@
   return false;
 }
 
-static bool PhiTypeSorterFunc(Value *V, Value *V2) {
-  return V->getType() < V2->getType();
-}
-
 /// Try and get a reduction value from a phi node.
 ///
 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
@@ -7413,7 +8606,7 @@
 /// performed.
 static bool tryToVectorizeHorReductionOrInstOperands(
     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
-    TargetTransformInfo *TTI,
+    TargetTransformInfo *TTI, const DataLayout &DL,
     const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
@@ -7445,11 +8638,21 @@
     if (IsBinop || IsSelect) {
       HorizontalReduction HorRdx;
       if (HorRdx.matchAssociativeReduction(P, Inst)) {
-        if (HorRdx.tryToReduce(R, TTI)) {
+        if (HorRdx.tryToReduce(R, TTI, DL)) {
           Res = true;
           // Set P to nullptr to avoid re-analysis of phi node in
           // matchAssociativeReduction function unless this is the root node.
           P = nullptr;
+          // Try to vectorize ExtraArgs.
+          // Continue analysis for the instruction from the same basic block
+          // only to save compile time.
+          if (++Level < RecursionMaxDepth)
+            for (auto *Op : HorRdx.getCopyOfExtraArgValues())
+              if (VisitedInstrs.insert(Op).second)
+                if (auto *I = dyn_cast<Instruction>(Op))
+                  if (!isa<PHINode>(I) && !R.isDeleted(I) &&
+                      I->getParent() == BB)
+                    Stack.emplace_back(I, Level);
           continue;
         }
       }
@@ -7499,7 +8702,7 @@
   auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
     return tryToVectorize(I, R);
   };
-  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *DL,
                                                   ExtraVectorization);
 }
 
@@ -7540,31 +8743,37 @@
 
 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
                                          BoUpSLP &R) {
-  if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
-    return true;
-
   bool OpsChanged = false;
   for (int Idx = 0; Idx < 2; ++Idx) {
     OpsChanged |=
         vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
   }
-  return OpsChanged;
+  return OpsChanged ||
+         tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R);
 }
 
 bool SLPVectorizerPass::vectorizeSimpleInstructions(
-    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
+    bool AtTerminator) {
   bool OpsChanged = false;
+  SmallVector<Instruction *, 4> PostponedCmps;
   for (auto *I : reverse(Instructions)) {
     if (R.isDeleted(I))
       continue;
-    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
-    else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
-    else if (auto *CI = dyn_cast<CmpInst>(I))
-      OpsChanged |= vectorizeCmpInst(CI, BB, R);
+    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
+      if (!AtTerminator)
+        PostponedCmps.push_back(CI);
+      else
+        OpsChanged |= vectorizeCmpInst(CI, BB, R);
+    }
   }
-  Instructions.clear();
+  // Insert in reverse order since the PostponedCmps vector was filled in
+  // reverse order.
+  Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
   return OpsChanged;
 }
 
@@ -7572,6 +8781,9 @@
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
   SmallPtrSet<Value *, 16> VisitedInstrs;
+  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
+  // node.
+  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
 
   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
@@ -7584,22 +8796,112 @@
       if (!P)
         break;
 
-      if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+      // No need to analyze deleted and/or vectorized nodes.
+      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
+          !P->getType()->isVectorTy())
         Incoming.push_back(P);
     }
 
-    // Sort by type.
-    llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+    // Find the corresponding non-phi nodes for better matching when trying to
+    // build the tree.
+    for (Value *V : Incoming) {
+      SmallVectorImpl<Value *> &Opcodes =
+          PHIToOpcodes.try_emplace(V).first->getSecond();
+      if (!Opcodes.empty())
+        continue;
+      SmallVector<Value *, 4> Nodes(1, V);
+      SmallPtrSet<Value *, 4> Visited;
+      while (!Nodes.empty()) {
+        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
+        if (!Visited.insert(PHI).second)
+          continue;
+        for (Value *V : PHI->incoming_values()) {
+          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
+            Nodes.push_back(PHI1);
+            continue;
+          }
+          Opcodes.emplace_back(V);
+        }
+      }
+    }
+
+    // Sort by type, parent, operands.
+    stable_sort(Incoming, [&PHIToOpcodes](Value *V1, Value *V2) {
+      if (V1->getType() < V2->getType())
+        return true;
+      if (V1->getType() > V2->getType())
+        return false;
+      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+      if (Opcodes1.size() < Opcodes2.size())
+        return true;
+      if (Opcodes1.size() > Opcodes2.size())
+        return false;
+      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+        // Undefs are compatible with any other value.
+        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+          continue;
+        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+            if (I1->getParent() < I2->getParent())
+              return true;
+            if (I1->getParent() > I2->getParent())
+              return false;
+            InstructionsState S = getSameOpcode({I1, I2});
+            if (S.getOpcode())
+              continue;
+            return I1->getOpcode() < I2->getOpcode();
+          }
+        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+          continue;
+        if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+          return true;
+        if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+          return false;
+      }
+      return false;
+    });
+
+    auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+      if (V1 == V2)
+        return true;
+      if (V1->getType() != V2->getType())
+        return false;
+      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+      if (Opcodes1.size() != Opcodes2.size())
+        return false;
+      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+        // Undefs are compatible with any other value.
+        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+          continue;
+        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+            if (I1->getParent() != I2->getParent())
+              return false;
+            InstructionsState S = getSameOpcode({I1, I2});
+            if (!S.getOpcode())
+              return false;
+            continue;
+          }
+        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+          continue;
+        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+          return false;
+      }
+      return true;
+    };
 
     // Try to vectorize elements base on their type.
+    SmallVector<Value *, 4> Candidates;
     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                            E = Incoming.end();
          IncIt != E;) {
 
-      // Look for the next elements with the same type.
+      // Look for the next elements with the same type, parent and operand
+      // kinds.
       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
-      while (SameTypeIt != E &&
-             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+      while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
         VisitedInstrs.insert(*SameTypeIt);
         ++SameTypeIt;
       }
@@ -7608,17 +8910,25 @@
       unsigned NumElts = (SameTypeIt - IncIt);
       LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
                         << NumElts << ")\n");
-      // The order in which the phi nodes appear in the program does not matter.
-      // So allow tryToVectorizeList to reorder them if it is beneficial. This
-      // is done when there are exactly two elements since tryToVectorizeList
-      // asserts that there are only two values when AllowReorder is true.
-      bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 &&
-          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+                                            /*AllowReorder=*/true)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
-        break;
+      } else if ((NumElts == 1 || NumElts < MinNonPow2ValuesSize.getValue()) &&
+                 (Candidates.empty() ||
+                  Candidates.front()->getType() == (*IncIt)->getType())) {
+        Candidates.append(IncIt, std::next(IncIt, NumElts));
+      }
+      // Final attempt to vectorize phis with the same types.
+      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
+        if (Candidates.size() > 1 &&
+            tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
+          // Success start over because instructions might have been changed.
+          HaveVectorizedPhiNodes = true;
+          Changed = true;
+        }
+        Candidates.clear();
       }
 
       // Start over at the next instruction of a different type (or the end).
@@ -7642,7 +8952,8 @@
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second) {
       if (it->use_empty() && KeyNodes.contains(&*it) &&
-          vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+          vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+                                      it->isTerminator())) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
         Changed = true;
@@ -7701,7 +9012,8 @@
       // Start vectorization of post-process list of instructions from the
       // top-tree instructions to try to vectorize as many instructions as
       // possible.
-      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+                                                it->isTerminator());
       if (OpsChanged) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
@@ -7823,7 +9135,86 @@
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                       << it->second.size() << ".\n");
 
-    Changed |= vectorizeStores(it->second, R);
+    // Sort by type, base pointers and values operand. Value operands must be
+    // compatible (have the same opcode, same parent), otherwise it is
+    // definitely not profitable to try to vectorize them.
+    auto &&StoreSorter = [](StoreInst *V, StoreInst *V2) {
+      if (V->getPointerOperandType() < V2->getPointerOperandType())
+        return true;
+      if (V->getPointerOperandType() > V2->getPointerOperandType())
+        return false;
+      // UndefValues are compatible with all other values.
+      if (isa<UndefValue>(V->getValueOperand()) ||
+          isa<UndefValue>(V2->getValueOperand()))
+        return false;
+      if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
+        if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
+          if (I1->getParent() < I2->getParent())
+            return true;
+          if (I1->getParent() > I2->getParent())
+            return false;
+          InstructionsState S = getSameOpcode({I1, I2});
+          if (S.getOpcode())
+            return false;
+          return I1->getOpcode() < I2->getOpcode();
+        }
+      if (isa<Constant>(V->getValueOperand()) &&
+          isa<Constant>(V2->getValueOperand()))
+        return false;
+      return V->getValueOperand()->getValueID() <
+             V2->getValueOperand()->getValueID();
+    };
+
+    llvm::stable_sort(it->second, StoreSorter);
+
+    auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
+      if (V1 == V2)
+        return true;
+      if (V1->getPointerOperandType() != V2->getPointerOperandType())
+        return false;
+      // Undefs are compatible with any other value.
+      if (isa<UndefValue>(V1->getValueOperand()) ||
+          isa<UndefValue>(V2->getValueOperand()))
+        return true;
+      if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
+        if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
+          if (I1->getParent() != I2->getParent())
+            return false;
+          InstructionsState S = getSameOpcode({I1, I2});
+          return S.getOpcode() > 0;
+        }
+      if (isa<Constant>(V1->getValueOperand()) &&
+          isa<Constant>(V2->getValueOperand()))
+        return true;
+      return V1->getValueOperand()->getValueID() ==
+             V2->getValueOperand()->getValueID();
+    };
+
+    // Try to vectorize elements based on their compatibility.
+    for (SmallVector<StoreInst *, 4>::iterator IncIt = it->second.begin(),
+                                               E = it->second.end();
+         IncIt != E;) {
+
+      // Look for the next elements with the same type.
+      SmallVector<StoreInst *, 4>::iterator SameTypeIt = IncIt;
+      Type *EltTy = (*IncIt)->getPointerOperand()->getType();
+
+      while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
+        ++SameTypeIt;
+
+      // Try to vectorize them.
+      unsigned NumElts = (SameTypeIt - IncIt);
+      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
+                        << NumElts << ")\n");
+      if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
+          vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
+        // Success start over because instructions might have been changed.
+        Changed = true;
+      }
+
+      // Start over at the next instruction of a different type (or the end).
+      IncIt = SameTypeIt;
+    }
   }
   return Changed;
 }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
@@ -3,16 +3,17 @@
 
 define void @f1(<2 x i16> %x, i16* %a) {
 ; CHECK-LABEL: @f1(
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    store i16 [[TMP1]], i16* [[A:%.*]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
-; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    store i16 [[T2]], i16* [[A:%.*]]
+; CHECK-NEXT:    store i16 [[T2]], i16* [[PTR0]]
+; CHECK-NEXT:    store i16 [[T3]], i16* [[PTR1]]
+; CHECK-NEXT:    store i16 [[T3]], i16* [[PTR2]]
+; CHECK-NEXT:    store i16 [[T2]], i16* [[PTR3]]
 ;
   %t2 = extractelement <2 x i16> %x, i32 0
   %t3 = extractelement <2 x i16> %x, i32 1
@@ -35,15 +36,17 @@
 ; CHECK:       cont:
 ; CHECK-NEXT:    [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
 ; CHECK-NEXT:    [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0
+; CHECK-NEXT:    [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    store i16 [[TMP0]], i16* [[A]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
-; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    store i16 [[T2]], i16* [[A]]
+; CHECK-NEXT:    store i16 [[T2]], i16* [[PTR0]]
+; CHECK-NEXT:    store i16 [[T3]], i16* [[PTR1]]
+; CHECK-NEXT:    store i16 [[T3]], i16* [[PTR2]]
+; CHECK-NEXT:    store i16 [[T2]], i16* [[PTR3]]
 ; CHECK-NEXT:    [[A_VAL:%.*]] = load i16, i16* [[A]], align 2
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]]
@@ -82,15 +85,17 @@
 ; CHECK:       cont:
 ; CHECK-NEXT:    [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
 ; CHECK-NEXT:    [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0
+; CHECK-NEXT:    [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    store i16 [[TMP0]], i16* [[A]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
-; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    store i16 [[T3]], i16* [[A]]
+; CHECK-NEXT:    store i16 [[T3]], i16* [[PTR0]]
+; CHECK-NEXT:    store i16 [[T2]], i16* [[PTR1]]
+; CHECK-NEXT:    store i16 [[T2]], i16* [[PTR2]]
+; CHECK-NEXT:    store i16 [[T3]], i16* [[PTR3]]
 ; CHECK-NEXT:    [[A_VAL:%.*]] = load i16, i16* [[A]], align 2
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
@@ -30,14 +30,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -313,14 +316,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -411,14 +417,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -610,14 +619,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -659,14 +671,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -1237,14 +1252,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -30,14 +30,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -313,14 +316,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -411,14 +417,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -610,14 +619,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -659,14 +671,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -1182,10 +1197,10 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #2
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) [[ATTR2:#.*]]
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #2
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) [[ATTR2]]
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;
@@ -1237,14 +1252,17 @@
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP5]], i32 1
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP6]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; NOACCELERATE-NEXT:    [[TMP7:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
+; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP7]], i32 3
 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
@@ -1270,10 +1288,10 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #3
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) [[ATTR3:#.*]]
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #3
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) [[ATTR3]]
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -15,19 +15,25 @@
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
 ; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
-; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
 ; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
-; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP5]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
 ;
@@ -65,22 +71,26 @@
 ; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
 ; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[A0]]
-; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
 ; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
 ; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]]
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]]
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]]
-; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C2:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C3:%.*]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP6]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
 ; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
-; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
+; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-32 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -203,10 +203,14 @@
 ;
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
-; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
+; MAX-COST-NEXT:    [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2) to <2 x i8>*), align 2
+; MAX-COST-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
+; MAX-COST-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[P0]], i32 0
+; MAX-COST-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
+; MAX-COST-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP3]], i32 1
+; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[TMP1]], i32 2
+; MAX-COST-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i8> [[TMP5]], <i8 0, i8 0, i8 0, i8 poison>
 ; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
 ; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
 ; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
@@ -220,21 +224,18 @@
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
 ; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP2]], i32 0
-; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
-; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
-; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
-; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 poison>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 poison>
+; MAX-COST-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; MAX-COST-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[REDUCTION_NORMALIZATION]])
+; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP8]], -5
+; MAX-COST-NEXT:    [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P26:%.*]] = add i32 [[OP_EXTRA]], [[P25]]
 ; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[P27]]
 ; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
-; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
-; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5
+; MAX-COST-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[P29]]
 ; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
+; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[P30]], [[P31]]
 ; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
@@ -9,7 +9,7 @@
 
 define <2 x float> @insertelement-fixed-vector() {
 ; CHECK-LABEL: @insertelement-fixed-vector(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> poison)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
@@ -9,7 +9,7 @@
 
 define <2 x float> @insertelement-fixed-vector() {
 ; CHECK-LABEL: @insertelement-fixed-vector(
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> poison)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -6,16 +6,16 @@
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -74,15 +74,17 @@
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
@@ -114,16 +116,16 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -145,22 +147,21 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT:    [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -184,25 +185,19 @@
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -230,14 +225,16 @@
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i32> [[TMP10]], <i32 65537, i32 65537, i32 65537, i32 65537>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -6,16 +6,16 @@
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -74,15 +74,17 @@
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
@@ -114,16 +116,16 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -145,22 +147,21 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT:    [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -184,25 +185,19 @@
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -230,14 +225,16 @@
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i32> [[TMP10]], <i32 65537, i32 65537, i32 65537, i32 65537>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
@@ -8,21 +8,21 @@
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ poison, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[I4:%.*]] = zext i1 undef to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], poison
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], <i16 8, i16 8>
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i32>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb11:
 ; CHECK-NEXT:    [[I12:%.*]] = zext i1 undef to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], poison
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <2 x i64> undef, [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <2 x i64> poison, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i1> [[TMP6]] to <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult <2 x i32> undef, [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult <2 x i32> poison, [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i1> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb25:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -242,16 +242,16 @@
 ;
 ; GFX8-LABEL: @uadd_sat_v3i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[SHUFFLE:%.*]] = shufflevector <3 x i16> [[ARG0:%.*]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
+; GFX8-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <3 x i16> [[ARG1:%.*]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[SHUFFLE]], <2 x i16> [[SHUFFLE1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[TMP3]], i64 0
-; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1
+; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[TMP1]], i64 0
+; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1
+; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP2]], i64 1
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -294,13 +294,13 @@
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[SHUFFLE]], <2 x i16> [[SHUFFLE1]])
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP2]])
+; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -242,16 +242,16 @@
 ;
 ; GFX8-LABEL: @uadd_sat_v3i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[SHUFFLE:%.*]] = shufflevector <3 x i16> [[ARG0:%.*]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
+; GFX8-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <3 x i16> [[ARG1:%.*]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[SHUFFLE]], <2 x i16> [[SHUFFLE1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP3]], i64 0
-; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1
+; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP1]], i64 0
+; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1
+; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP2]], i64 1
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -294,13 +294,13 @@
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[SHUFFLE]], <2 x i16> [[SHUFFLE1]])
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP2]])
+; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
@@ -9,21 +9,22 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADD277:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT:    store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
 ; CHECK-NEXT:    [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
 ; CHECK-NEXT:    [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0) to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> poison, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = ashr <4 x i32> [[TMP8]], <i32 6, i32 6, i32 6, i32 6>
 ; CHECK-NEXT:    [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefix=CHECK
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 | FileCheck %s --check-prefix=CHECK
 ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION
 
 define void @Test(i32) {
@@ -7,7 +7,7 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP14:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
@@ -39,33 +39,29 @@
 ; CHECK-NEXT:    [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
 ; CHECK-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
 ; CHECK-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_EXTRA26]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_EXTRA26]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = and <2 x i32> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP14]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP13]], i32 1
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(
 ; FORCE_REDUCTION-NEXT:  entry:
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
 ; FORCE_REDUCTION:       loop:
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
-; FORCE_REDUCTION-NEXT:    [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
-; FORCE_REDUCTION-NEXT:    [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
-; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]])
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
-; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
-; FORCE_REDUCTION-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]]
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef>
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 poison, i32 poison>
+; FORCE_REDUCTION-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[REDUCTION_NORMALIZATION]])
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]]
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]]
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]]
@@ -96,13 +92,12 @@
 ; FORCE_REDUCTION-NEXT:    [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
 ; FORCE_REDUCTION-NEXT:    [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]]
 ; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
-; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_41]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]]
-; FORCE_REDUCTION-NEXT:    [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]]
-; FORCE_REDUCTION-NEXT:    [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[VAL_41]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = and <2 x i32> [[TMP6]], [[TMP7]]
+; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
+; FORCE_REDUCTION-NEXT:    [[TMP10]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
@@ -7,31 +7,47 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
+; SSE-LABEL: @ceil_floor(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; SSE-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[SHRINK_SHUFFLE]])
+; SSE-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; SSE-NEXT:    [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; SSE-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP6]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
 ; CHECK-LABEL: @ceil_floor(
 ; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; CHECK-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; CHECK-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; CHECK-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
-; CHECK-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; CHECK-NEXT:    [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
-; CHECK-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
-; CHECK-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
-; CHECK-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
+; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[SHRINK_SHUFFLE]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R5:%.*]] = shufflevector <8 x float> [[R2]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
@@ -7,31 +7,47 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
+; SSE-LABEL: @ceil_floor(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; SSE-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[SHRINK_SHUFFLE]])
+; SSE-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; SSE-NEXT:    [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; SSE-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP6]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
 ; CHECK-LABEL: @ceil_floor(
 ; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; CHECK-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; CHECK-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; CHECK-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
-; CHECK-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; CHECK-NEXT:    [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
-; CHECK-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
-; CHECK-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
-; CHECK-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
+; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[SHRINK_SHUFFLE]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 undef>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R5:%.*]] = shufflevector <8 x float> [[R2]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -8,30 +8,11 @@
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE-LABEL: @sitofp_uitofp(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SSE-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
-; SSE-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
-; SSE-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
-; SSE-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
-; SSE-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
-; SSE-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[SHUFFLE]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
@@ -81,26 +62,24 @@
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
 ; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; SSE-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; SSE-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; SSE-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[SHUFFLE]] to <4 x i32>
 ; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
 ; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
 ; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3
 ; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
 ; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
@@ -135,30 +114,11 @@
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
-; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[SHUFFLE]] to <4 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX512-LABEL: @fptosi_fptoui(
@@ -335,24 +295,24 @@
 ; Inspired by PR38154
 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
 ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
 ; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
 ; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
 ; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
 ; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
 ; SSE-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
 ; SSE-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <2 x i32> [[SHUFFLE]] to <2 x float>
 ; SSE-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
 ; SSE-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
 ; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
 ; SSE-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
 ; SSE-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
 ; SSE-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
 ; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
 ; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
 ; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -8,30 +8,11 @@
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE-LABEL: @sitofp_uitofp(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SSE-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
-; SSE-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
-; SSE-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
-; SSE-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
-; SSE-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
-; SSE-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[SHUFFLE]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
@@ -81,26 +62,24 @@
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
 ; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; SSE-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; SSE-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; SSE-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[SHUFFLE]] to <4 x i32>
 ; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
 ; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
 ; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3
 ; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
 ; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
@@ -135,30 +114,11 @@
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
-; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[SHUFFLE]] to <4 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32>
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX512-LABEL: @fptosi_fptoui(
@@ -335,24 +295,24 @@
 ; Inspired by PR38154
 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
 ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
 ; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
 ; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
 ; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
 ; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
 ; SSE-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
 ; SSE-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <2 x i32> [[SHUFFLE]] to <2 x float>
 ; SSE-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
 ; SSE-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
 ; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
 ; SSE-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
 ; SSE-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
 ; SSE-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
 ; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
 ; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
 ; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -143,14 +143,15 @@
 ; SSE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v4f32_const(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[SHUFFLE]], <float 2.000000e+00, float 1.000000e+00>
 ; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
-; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; SLM-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
 ; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
 ; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -143,14 +143,15 @@
 ; SSE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v4f32_const(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[SHUFFLE]], <float 2.000000e+00, float 1.000000e+00>
 ; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
-; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; SLM-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
 ; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
 ; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -110,22 +110,22 @@
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
 ; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
 ; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
 ; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 undef, i32 undef>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32(
@@ -177,19 +177,19 @@
 
 define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-LABEL: @ashr_shl_v8i32_const(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[SHUFFLE]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
-; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[SHUFFLE]], <i32 2, i32 2, i32 2, i32 2>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32_const(
@@ -293,25 +293,18 @@
 ; AVX2-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; AVX2-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
 ; AVX2-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; AVX2-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; AVX2-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
+; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5
 ; AVX2-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; AVX2-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX2-NEXT:    ret <8 x i32> [[R7]]
@@ -321,25 +314,18 @@
 ; AVX512-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; AVX512-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
 ; AVX512-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; AVX512-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; AVX512-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
+; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
+; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5
 ; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX512-NEXT:    ret <8 x i32> [[R7]]
@@ -412,26 +398,56 @@
 }
 
 define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
-; CHECK-LABEL: @sdiv_v8i32_undefs(
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; CHECK-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
-; CHECK-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
-; CHECK-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
-; CHECK-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
-; CHECK-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
-; CHECK-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; SSE-LABEL: @sdiv_v8i32_undefs(
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; SSE-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; SSE-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SSE-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
+; SSE-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
+; SSE-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @sdiv_v8i32_undefs(
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; AVX1-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; AVX1-NEXT:    [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 16, i32 16, i32 4, i32 8>
+; AVX1-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @sdiv_v8i32_undefs(
+; AVX2-NEXT:    [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], <i32 4, i32 4, i32 8, i32 16, i32 4, i32 4, i32 8, i32 16>
+; AVX2-NEXT:    ret <8 x i32> [[TMP1]]
+;
+; AVX512-LABEL: @sdiv_v8i32_undefs(
+; AVX512-NEXT:    [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], <i32 4, i32 4, i32 8, i32 16, i32 4, i32 4, i32 8, i32 16>
+; AVX512-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -110,22 +110,22 @@
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
 ; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
 ; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX1-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
 ; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 undef, i32 undef>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32(
@@ -177,19 +177,19 @@
 
 define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-LABEL: @ashr_shl_v8i32_const(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[SHUFFLE]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
-; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[SHUFFLE]], <i32 2, i32 2, i32 2, i32 2>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32_const(
@@ -293,25 +293,18 @@
 ; AVX2-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; AVX2-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
 ; AVX2-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; AVX2-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; AVX2-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
+; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5
 ; AVX2-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; AVX2-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX2-NEXT:    ret <8 x i32> [[R7]]
@@ -321,25 +314,18 @@
 ; AVX512-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; AVX512-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
 ; AVX512-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; AVX512-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; AVX512-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
+; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
+; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5
 ; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX512-NEXT:    ret <8 x i32> [[R7]]
@@ -412,26 +398,56 @@
 }
 
 define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
-; CHECK-LABEL: @sdiv_v8i32_undefs(
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; CHECK-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
-; CHECK-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
-; CHECK-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
-; CHECK-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
-; CHECK-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
-; CHECK-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i32 1
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; SSE-LABEL: @sdiv_v8i32_undefs(
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; SSE-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; SSE-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SSE-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
+; SSE-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
+; SSE-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @sdiv_v8i32_undefs(
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; AVX1-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; AVX1-NEXT:    [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 16, i32 16, i32 4, i32 8>
+; AVX1-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @sdiv_v8i32_undefs(
+; AVX2-NEXT:    [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], <i32 4, i32 4, i32 8, i32 16, i32 4, i32 4, i32 8, i32 16>
+; AVX2-NEXT:    ret <8 x i32> [[TMP1]]
+;
+; AVX512-LABEL: @sdiv_v8i32_undefs(
+; AVX512-NEXT:    [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], <i32 4, i32 4, i32 8, i32 16, i32 4, i32 4, i32 8, i32 16>
+; AVX512-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -81,18 +81,16 @@
 
 define i8 @i(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @i(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[Y:%.*]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i8 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -110,18 +108,16 @@
 
 define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @j(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[Y:%.*]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -139,18 +135,16 @@
 
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -172,16 +166,15 @@
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
 ; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 ; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    ret i8 [[TMP6]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -81,18 +81,16 @@
 
 define i8 @i(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @i(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[Y:%.*]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i8 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -110,18 +108,16 @@
 
 define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @j(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[Y:%.*]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -139,18 +135,16 @@
 
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -172,16 +166,15 @@
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
 ; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 ; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    ret i8 [[TMP6]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -237,23 +237,22 @@
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
 ; CHECK-LABEL: @fcmp_ord_uno_v4i32(
 ; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
 ; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[B1:%.*]] = load float, float* [[P1]], align 4
-; CHECK-NEXT:    [[B2:%.*]] = load float, float* [[P2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
 ; CHECK-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fcmp uno float [[B1]], [[A1]]
-; CHECK-NEXT:    [[C2:%.*]] = fcmp uno float [[B2]], [[A2]]
+; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <2 x float> [[TMP2]], [[SHRINK_SHUFFLE]]
 ; CHECK-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
 ; CHECK-NEXT:    [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
-; CHECK-NEXT:    [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1
-; CHECK-NEXT:    [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT:    [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT:    [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[TMP5]], i32 2
 ; CHECK-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3
 ; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -237,23 +237,22 @@
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
 ; CHECK-LABEL: @fcmp_ord_uno_v4i32(
 ; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
 ; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[B1:%.*]] = load float, float* [[P1]], align 4
-; CHECK-NEXT:    [[B2:%.*]] = load float, float* [[P2]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
 ; CHECK-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
-; CHECK-NEXT:    [[C1:%.*]] = fcmp uno float [[B1]], [[A1]]
-; CHECK-NEXT:    [[C2:%.*]] = fcmp uno float [[B2]], [[A2]]
+; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <2 x float> [[TMP2]], [[SHRINK_SHUFFLE]]
 ; CHECK-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
 ; CHECK-NEXT:    [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
-; CHECK-NEXT:    [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1
-; CHECK-NEXT:    [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT:    [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT:    [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[TMP5]], i32 2
 ; CHECK-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3
 ; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -51,26 +51,26 @@
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @splat(
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
-; AVX-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
-; AVX-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> poison, i8 [[A:%.*]], i32 0
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-; AVX-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 1
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 2
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 3
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 4
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 5
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 6
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 7
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 8
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 9
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 10
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 11
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 12
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 13
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <16 x i8> [[TMP16]], i8 [[C]], i32 14
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[C]], i32 15
+; AVX-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[SHUFFLE]], [[TMP18]]
 ; AVX-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
 ; AVX-NEXT:    ret void
 ;
@@ -141,7 +141,7 @@
 ; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
 ; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
 ; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
-; AVX-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
+; AVX-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP9]]
 ; AVX-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
 ; AVX-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -165,13 +165,28 @@
 
 define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]]
-; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[Y0:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
+; CHECK-NEXT:    [[Y3:%.*]] = extractelement <4 x i32> [[Y]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X3]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X1]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y3]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[Y0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y3]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[Y2]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[Y1]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[X3]], i32 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt <8 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 10>
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[REDUCTION_NORMALIZATION]])
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[TMP12]], i32 -1, i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -55,35 +55,32 @@
 ; AVX:       for.body:
 ; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; AVX-NEXT:    [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ]
+; AVX-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
 ; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
 ; AVX-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
 ; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
 ; AVX-NEXT:    store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
-; AVX-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
-; AVX-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]]
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
-; AVX-NEXT:    [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; AVX-NEXT:    [[ADD13]] = fadd float [[TMP16]], [[TMP17]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i32 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1
-; AVX-NEXT:    [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP21]]
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; AVX-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[SHUFFLE]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT:    [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
+; AVX-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
+; AVX-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
+; AVX-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; AVX-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; AVX-NEXT:    [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
+; AVX-NEXT:    [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT:    [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
+; AVX-NEXT:    [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
 ; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
 ; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; AVX:       for.end:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
@@ -34,23 +34,22 @@
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double undef, i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul fast <2 x double> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
 ; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[LABEL:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x double> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP18]], double [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> poison, double [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> [[TMP17]], double [[TMP18]], i32 1
 ; CHECK-NEXT:    br label [[LABEL]]
 ; CHECK:       label:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP20]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP19]], [[BB2]] ]
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
@@ -131,10 +131,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@@ -97,10 +97,10 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], poison
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], poison
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], poison
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,25 +4,13 @@
 define i32 @crash_reordering_undefs() {
 ; CHECK-LABEL: @crash_reordering_undefs(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT:    [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD1:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT:    [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT:    [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]]
-; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[ADD5]], undef
-; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[ADD6]], undef
-; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[ADD7]], undef
-; CHECK-NEXT:    [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT:    [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]]
-; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[ADD10]], undef
-; CHECK-NEXT:    ret i32 [[ADD11]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> poison)
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP0]], undef
+; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef
+; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], undef
+; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA2]], undef
+; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], undef
+; CHECK-NEXT:    ret i32 [[OP_EXTRA4]]
 ;
 entry:
   %or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -31,15 +31,14 @@
 ; CHECK:       cond.false66.us:
 ; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 0xBFA5CC2D1960285F>
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 1.400000e+02, double 1.400000e+02>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], <double 5.000000e+01, double 5.200000e+01>
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], <double 0.000000e+00, double 0xBFA5CC2D1960285F>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 1.400000e+02, double 1.400000e+02>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> poison, [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
@@ -111,14 +110,14 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double undef, double poison>, double undef, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double poison, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> poison, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> poison, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> poison, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> poison, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> poison, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> poison, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> poison, [[TMP6]]
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
@@ -25,27 +25,36 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1
 ; CHECK-NEXT:    br label [[TMP7:%.*]]
-; CHECK:         [[TMP8:%.*]] = phi <2 x double> [ <double 1.800000e+01, double 2.800000e+01>, [[TMP0]] ], [ [[TMP11:%.*]], [[TMP21:%.*]] ], [ [[TMP11]], [[TMP18:%.*]] ], [ [[TMP11]], [[TMP18]] ]
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ <double 1.800000e+01, double 2.800000e+01>, [[TMP0]] ], [ [[TMP11:%.*]], [[TMP21:%.*]] ], [ [[TMP11]], [[TMP18:%.*]] ], [ [[TMP11]], [[TMP18]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP11]] = load <2 x double>, <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[TMP12:%.*]], label [[TMP13:%.*]]
-; CHECK:         ret void
-; CHECK:         [[TMP14:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
+; CHECK:       12:
+; CHECK-NEXT:    ret void
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[TMP15:%.*]], label [[TMP16:%.*]]
-; CHECK:         br label [[TMP16]]
-; CHECK:         br i1 undef, label [[TMP17:%.*]], label [[TMP18]]
-; CHECK:         unreachable
-; CHECK:         [[TMP19:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; CHECK:       15:
+; CHECK-NEXT:    br label [[TMP16]]
+; CHECK:       16:
+; CHECK-NEXT:    br i1 undef, label [[TMP17:%.*]], label [[TMP18]]
+; CHECK:       17:
+; CHECK-NEXT:    unreachable
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
 ; CHECK-NEXT:    switch i32 undef, label [[TMP21]] [
 ; CHECK-NEXT:    i32 32, label [[TMP7]]
 ; CHECK-NEXT:    i32 103, label [[TMP7]]
 ; CHECK-NEXT:    ]
-; CHECK:         br i1 undef, label [[TMP7]], label [[TMP22:%.*]]
-; CHECK:         unreachable
+; CHECK:       21:
+; CHECK-NEXT:    br i1 undef, label [[TMP7]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    unreachable
 ;
   %1 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 0
   %2 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -18,21 +18,15 @@
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[SHUFFLE]], <double 4.000000e+00, double 3.000000e+00, double 4.000000e+00, double poison>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], <double 1.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll
@@ -54,14 +54,11 @@
 ; CHECK-LABEL: @fextr2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LD:%.*]] = load <4 x double>, <4 x double>* undef, align 32
-; CHECK-NEXT:    [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0
-; CHECK-NEXT:    [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 5.500000e+00, double 6.600000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 5.500000e+00, double 6.600000e+00>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
@@ -46,13 +46,13 @@
 ; CHECK-NEXT:    ret float [[X0]]
 ;
 ; THRESH1-LABEL: @f_used_out_of_tree(
-; THRESH1-NEXT:    [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
-; THRESH1-NEXT:    [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
-; THRESH1-NEXT:    [[X0X0:%.*]] = fmul float [[X0]], [[X0]]
-; THRESH1-NEXT:    [[X1X1:%.*]] = fmul float [[X1]], [[X1]]
-; THRESH1-NEXT:    [[ADD:%.*]] = fadd float [[X0X0]], [[X1X1]]
+; THRESH1-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; THRESH1-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
+; THRESH1-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH1-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH1-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
 ; THRESH1-NEXT:    store float [[ADD]], float* @a, align 4
-; THRESH1-NEXT:    ret float [[X0]]
+; THRESH1-NEXT:    ret float [[TMP1]]
 ;
 ; THRESH2-LABEL: @f_used_out_of_tree(
 ; THRESH2-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
@@ -85,7 +85,7 @@
 ; THRESH1-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
 ; THRESH1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
 ; THRESH1-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; THRESH1-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[X]], [[TMP3]]
+; THRESH1-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]]
 ; THRESH1-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; THRESH1-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; THRESH1-NEXT:    [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]]
@@ -95,7 +95,7 @@
 ; THRESH2-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
 ; THRESH2-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
 ; THRESH2-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; THRESH2-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[X]], [[TMP3]]
+; THRESH2-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[X]]
 ; THRESH2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; THRESH2-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; THRESH2-NEXT:    [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
@@ -194,32 +194,73 @@
 }
 
 define void @fptosi_8f64_8i8() #0 {
-; CHECK-LABEL: @fptosi_8f64_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @fptosi_8f64_8i8(
+; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; SSE-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @fptosi_8f64_8i8(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; AVX256NODQ-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @fptosi_8f64_8i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f64_8i8(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; AVX256DQ-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX256DQ-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -427,32 +468,46 @@
 }
 
 define void @fptosi_8f32_8i8() #0 {
-; CHECK-LABEL: @fptosi_8f32_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @fptosi_8f32_8i8(
+; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
+; SSE-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT:    ret void
+;
+; AVX512-LABEL: @fptosi_8f32_8i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f32_8i8(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; AVX256DQ-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX256DQ-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -194,32 +194,73 @@
 }
 
 define void @fptosi_8f64_8i8() #0 {
-; CHECK-LABEL: @fptosi_8f64_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @fptosi_8f64_8i8(
+; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; SSE-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @fptosi_8f64_8i8(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; AVX256NODQ-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @fptosi_8f64_8i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f64_8i8(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; AVX256DQ-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX256DQ-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -427,32 +468,46 @@
 }
 
 define void @fptosi_8f32_8i8() #0 {
-; CHECK-LABEL: @fptosi_8f32_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @fptosi_8f32_8i8(
+; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
+; SSE-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
+; SSE-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
+; SSE-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
+; SSE-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
+; SSE-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
+; SSE-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
+; SSE-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
+; SSE-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT:    ret void
+;
+; AVX512-LABEL: @fptosi_8f32_8i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f32_8i8(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; AVX256DQ-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX256DQ-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX256DQ-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -239,32 +239,66 @@
 }
 
 define void @fptoui_8f64_8i8() #0 {
-; CHECK-LABEL: @fptoui_8f64_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @fptoui_8f64_8i8(
+; SSE-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i8
+; SSE-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i8
+; SSE-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i8
+; SSE-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i8
+; SSE-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i8
+; SSE-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i8
+; SSE-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i8
+; SSE-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i8
+; SSE-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f64_8i8(
+; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i8
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i8
+; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i8
+; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i8
+; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i8
+; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i8
+; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i8
+; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i8
+; AVX256NODQ-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; AVX256NODQ-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX-LABEL: @fptoui_8f64_8i8(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8>
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -517,32 +551,39 @@
 }
 
 define void @fptoui_8f32_8i8() #0 {
-; CHECK-LABEL: @fptoui_8f32_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @fptoui_8f32_8i8(
+; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i8
+; SSE-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i8
+; SSE-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i8
+; SSE-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i8
+; SSE-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i8
+; SSE-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i8
+; SSE-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i8
+; SSE-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i8
+; SSE-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @fptoui_8f32_8i8(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i8>
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[SHUFFLE]], <16 x i8>* bitcast ([64 x i8]* @dst8 to <16 x i8>*), i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+; AVX-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
   %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=haswell < %s -slp-min-non-power2-values-size=2 | FileCheck %s
 @e = dso_local local_unnamed_addr global i32 0, align 4
 @f = dso_local local_unnamed_addr global i32 0, align 4
 
@@ -11,36 +11,33 @@
 ; CHECK-NEXT:    [[TOBOOL_NOT19:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
-; CHECK-NEXT:    [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 1, i64 1>
-; CHECK-NEXT:    switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <4 x i32*> [ [[TMP16:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ poison, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32*> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64> <i64 1, i64 1, i64 1, i64 poison>
+; CHECK-NEXT:    switch i32 [[TMP4]], label [[WHILE_BODY_BACKEDGE]] [
 ; CHECK-NEXT:    i32 2, label [[SW_BB:%.*]]
 ; CHECK-NEXT:    i32 4, label [[SW_BB6:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[TMP9]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint i32* [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 2
+; CHECK-NEXT:    store i32 [[TMP8]], i32* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64> <i64 2, i64 2, i64 2, i64 poison>
 ; CHECK-NEXT:    br label [[WHILE_BODY_BACKEDGE]]
 ; CHECK:       sw.bb6:
-; CHECK-NEXT:    [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0
-; CHECK-NEXT:    store i32 [[TMP11]], i32* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint i32* [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64> <i64 2, i64 2, i64 2, i64 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 1
+; CHECK-NEXT:    store i32 [[TMP13]], i32* [[TMP15]], align 4
 ; CHECK-NEXT:    br label [[WHILE_BODY_BACKEDGE]]
 ; CHECK:       while.body.backedge:
-; CHECK-NEXT:    [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
-; CHECK-NEXT:    [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ]
+; CHECK-NEXT:    [[TMP16]] = phi <4 x i32*> [ [[TMP5]], [[WHILE_BODY]] ], [ [[TMP14]], [[SW_BB6]] ], [ [[TMP10]], [[SW_BB]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret i32 undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll
@@ -16,9 +16,9 @@
 define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[N:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -761,44 +761,29 @@
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
 ; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
 ; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT:    [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]]
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
-; AVX-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]]
-; AVX-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP10]], i32 [[TMP5]]
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[OP_EXTRA1]], [[TMP11]]
-; AVX-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA1]], i32 [[TMP11]]
-; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; AVX-NEXT:    store i32 [[TMP14]], i32* @var, align 8
-; AVX-NEXT:    ret i32 [[TMP13]]
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <8 x i32>*), i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i32> undef)
+; AVX-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[REDUCTION_NORMALIZATION]])
+; AVX-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]]
+; AVX-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP7]], i32 [[TMP5]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX-NEXT:    store i32 [[TMP8]], i32* @var, align 8
+; AVX-NEXT:    ret i32 [[OP_EXTRA1]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses(
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
 ; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
 ; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; THRESH-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; THRESH-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
-; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; THRESH-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1
-; THRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
-; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP4]], i32 1
-; THRESH-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP9]], [[TMP11]]
-; THRESH-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP11]]
-; THRESH-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
-; THRESH-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
-; THRESH-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP15]], [[TMP14]]
-; THRESH-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP15]], i32 [[TMP14]]
-; THRESH-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[OP_EXTRA1]], [[TMP16]]
-; THRESH-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[OP_EXTRA1]], i32 [[TMP16]]
-; THRESH-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; THRESH-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 3, i32 4
-; THRESH-NEXT:    store i32 [[TMP20]], i32* @var, align 8
-; THRESH-NEXT:    ret i32 [[TMP18]]
+; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
+; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
+; THRESH-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <8 x i32>*), i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i32> undef)
+; THRESH-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[REDUCTION_NORMALIZATION]])
+; THRESH-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]]
+; THRESH-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP8]], i32 [[TMP6]]
+; THRESH-NEXT:    [[TMP9:%.*]] = select i1 [[TMP5]], i32 3, i32 4
+; THRESH-NEXT:    store i32 [[TMP9]], i32* @var, align 8
+; THRESH-NEXT:    ret i32 [[OP_EXTRA1]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -862,16 +847,11 @@
 ; AVX-NEXT:    br label [[PP:%.*]]
 ; AVX:       pp:
 ; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <8 x i32>*), i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i32> undef)
+; AVX-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[REDUCTION_NORMALIZATION]])
+; AVX-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]]
+; AVX-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP7]], i32 [[TMP5]]
 ; AVX-NEXT:    ret i32 [[OP_EXTRA1]]
 ;
 ; THRESH-LABEL: @maxi8_wrong_parent(
@@ -881,24 +861,12 @@
 ; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
 ; THRESH-NEXT:    br label [[PP:%.*]]
 ; THRESH:       pp:
-; THRESH-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; THRESH-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; THRESH-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; THRESH-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; THRESH-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; THRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x i1> poison, i1 [[TMP12]], i32 0
-; THRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1
-; THRESH-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0
-; THRESH-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1
-; THRESH-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1
-; THRESH-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]]
-; THRESH-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1
-; THRESH-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0
-; THRESH-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP21]], [[TMP20]]
-; THRESH-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP21]], i32 [[TMP20]]
+; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
+; THRESH-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <8 x i32>*), i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i32> undef)
+; THRESH-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[REDUCTION_NORMALIZATION]])
+; THRESH-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]]
+; THRESH-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP8]], i32 [[TMP6]]
 ; THRESH-NEXT:    ret i32 [[OP_EXTRA1]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -218,42 +218,27 @@
 ; Unused insertelement
 define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_no_users(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> poison, float [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
+; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE6:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE7:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[SHUFFLE5]], zeroinitializer
+; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <2 x i32> [[SHRINK_SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[SHUFFLE6]], <2 x float> [[SHUFFLE7]]
+; CHECK-NEXT:    [[SHRINK_SHUFFLE2:%.*]] = shufflevector <4 x float> [[SHUFFLE1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHRINK_SHUFFLE4:%.*]] = shufflevector <4 x float> [[SHUFFLE3]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[SHRINK_SHUFFLE2]], <2 x float> [[SHRINK_SHUFFLE4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP8]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[RD]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -218,42 +218,27 @@
 ; Unused insertelement
 define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_no_users(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
+; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE6:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE7:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[SHUFFLE5]], zeroinitializer
+; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <2 x i32> [[SHRINK_SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[SHUFFLE6]], <2 x float> [[SHUFFLE7]]
+; CHECK-NEXT:    [[SHRINK_SHUFFLE2:%.*]] = shufflevector <4 x float> [[SHUFFLE1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SHRINK_SHUFFLE4:%.*]] = shufflevector <4 x float> [[SHUFFLE3]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP2]], <2 x float> [[SHRINK_SHUFFLE2]], <2 x float> [[SHRINK_SHUFFLE4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP8]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[RD]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -11,12 +11,9 @@
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 8, i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP6]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> <i32 poison, i32 ptrtoint (i32 ()* @fn1 to i32), i32 ptrtoint (i32 ()* @fn1 to i32), i32 8>, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
@@ -52,7 +52,7 @@
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP26]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ poison, [[ENTRY]] ], [ [[TMP26]], [[FOR_INC]] ]
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -54,15 +54,16 @@
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[X2:%.*]] = load float, float* [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> undef)
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[I3]]
 ;
   %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
@@ -115,13 +116,16 @@
 ; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
 ; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
-; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[T8]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T4]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[TMP6]], i32 2
+; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[TMP6]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[T15]]
 ;
   %t0 = bitcast <4 x float>* %x to i64*
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -54,15 +54,16 @@
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[X2:%.*]] = load float, float* [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> undef)
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[I3]]
 ;
   %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
@@ -115,13 +116,16 @@
 ; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
 ; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
-; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[T8]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T4]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[TMP6]], i32 2
+; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[TMP6]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[T15]]
 ;
   %t0 = bitcast <4 x float>* %x to i64*
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -37,7 +37,7 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
 ; CHECK-NEXT:    ret void
@@ -104,7 +104,7 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    ret void
@@ -175,7 +175,7 @@
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
 ; CHECK-NEXT:    ret void
@@ -229,34 +229,37 @@
 define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
 ; CHECK-LABEL: @lookahead_external_uses(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
-; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[A]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
+; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B2]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
+; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
-; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -319,36 +322,38 @@
 ; CHECK-LABEL: @lookahead_limit_users_budget(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
+; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[B:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[B]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
+; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
 ; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B1]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP11]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT:    store double [[TMP14]], double* [[EXT1:%.*]], align 8
-; CHECK-NEXT:    store double [[TMP14]], double* [[EXT2:%.*]], align 8
-; CHECK-NEXT:    store double [[TMP14]], double* [[EXT3:%.*]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP12]], double* [[EXT2:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP12]], double* [[EXT3:%.*]], align 8
 ; CHECK-NEXT:    store double [[B1]], double* [[EXT4:%.*]], align 8
 ; CHECK-NEXT:    store double [[B1]], double* [[EXT5:%.*]], align 8
 ; CHECK-NEXT:    ret void
@@ -588,6 +593,7 @@
 ; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
 ; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
 ; CHECK-NEXT:    [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
@@ -598,15 +604,15 @@
 ; CHECK-NEXT:    [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRA1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[EXTRB1]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP5]], [[TMP12]]
 ; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
 ; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx  -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx  -S | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -18,12 +18,13 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1
 ; CHECK-NEXT:    [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1
 ; CHECK-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
@@ -56,35 +57,22 @@
 ;        ensure correctness.
 ;
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
-; SSE-LABEL: @PR31243_sext(
-; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; SSE-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]]
-; SSE-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
-; SSE-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; SSE-NEXT:    ret i8 [[TMP8]]
-;
-; AVX-LABEL: @PR31243_sext(
-; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
-; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
-; AVX-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
-; AVX-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
-; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; AVX-NEXT:    ret i8 [[TMP8]]
+; CHECK-LABEL: @PR31243_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
 entry:
   %tmp0 = sext i8 %v0 to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -slp-threshold=-200 -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -S | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -slp-threshold=-200 -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -S -slp-min-non-power2-stores-size=1 -slp-min-non-power2-values-size=1 | FileCheck %s
 
 define void @test_add_sdiv(i32 *%arr1, i32 *%arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: @test_add_sdiv(
@@ -12,22 +12,21 @@
 ; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1
 ; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2
 ; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3
-; CHECK-NEXT:    [[V0:%.*]] = load i32, i32* [[GEP1_0]]
-; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1_1]]
-; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]]
-; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]]
-; CHECK-NEXT:    [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146
-; CHECK-NEXT:    [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146
+; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[GEP1_0]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 undef>
 ; CHECK-NEXT:    [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
-; CHECK-NEXT:    [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
-; CHECK-NEXT:    [[RES0:%.*]] = add nsw i32 [[V0]], [[Y0]]
-; CHECK-NEXT:    [[RES1:%.*]] = add nsw i32 [[V1]], [[Y1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], <i32 1146, i32 146, i32 0, i32 poison>
 ; CHECK-NEXT:    [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]]
-; CHECK-NEXT:    [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
-; CHECK-NEXT:    store i32 [[RES0]], i32* [[GEP2_0]]
-; CHECK-NEXT:    store i32 [[RES1]], i32* [[GEP2_1]]
-; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]]
-; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP5]]
+; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]], align 4
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[GEP2_0]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>)
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -76,22 +75,21 @@
 ; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1
 ; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2
 ; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3
-; CHECK-NEXT:    [[V0:%.*]] = load i32, i32* [[GEP1_0]]
-; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1_1]]
-; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]]
-; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]]
-; CHECK-NEXT:    [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146
-; CHECK-NEXT:    [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146
-; CHECK-NEXT:    [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[GEP1_0]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> undef)
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], <i32 1146, i32 146, i32 42, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
 ; CHECK-NEXT:    [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
-; CHECK-NEXT:    [[RES0:%.*]] = urem i32 [[V0]], [[Y0]]
-; CHECK-NEXT:    [[RES1:%.*]] = urem i32 [[V1]], [[Y1]]
-; CHECK-NEXT:    [[RES2:%.*]] = urem i32 [[V2]], [[Y2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = urem <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
-; CHECK-NEXT:    store i32 [[RES0]], i32* [[GEP2_0]]
-; CHECK-NEXT:    store i32 [[RES1]], i32* [[GEP2_1]]
-; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]]
-; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[GEP2_0]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx -slp-min-non-power2-stores-size=5 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 
@@ -12,7 +12,7 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
@@ -142,16 +142,13 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -183,11 +180,11 @@
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -218,16 +215,13 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -264,32 +258,24 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]]
-; CHECK-NEXT:    [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
-; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    store float [[MUL45]], float* [[ARRAYIDX31]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <8 x float>*
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x float> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <8 x float> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[ARRAYIDX5]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP10]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>)
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP11]], 31995
+; CHECK-NEXT:    [[TMP12]] = extractelement <8 x float> [[TMP6]], i32 4
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
@@ -348,7 +334,7 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
@@ -13,31 +13,28 @@
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef
 ; CHECK-NEXT:    [[SHR15:%.*]] = ashr i32 [[SUB14]], 2
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[SHR15]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[SUB14]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], <i32 0, i32 -1, i32 -5, i32 -9>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef
-; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef
-; CHECK-NEXT:    [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64>
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SHUFFLE1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], poison
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> poison
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i32> [[TMP5]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
 ; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]]
+; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -140,49 +140,47 @@
 define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x float> undef)
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
-; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <8 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[SHRINK_SHUFFLE:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP11]] = extractelement <2 x float> [[SHUFFLE1]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13]] = extractelement <2 x float> [[SHUFFLE1]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
-; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[ARRAYIDX14]] to <4 x float>*
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> poison, float [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP8]], float [[TMP9]], i32 1
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[SHUFFLE2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP10]], float [[TMP11]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[SHUFFLE2]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP13]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP15]], i32 4
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <8 x float> [[TMP16]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float poison, float poison, float poison>
+; CHECK-NEXT:    [[TMP18]] = fadd <8 x float> [[TMP2]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHRINK_SHUFFLE]] = shufflevector <4 x float> [[SHUFFLE1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x float> [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <8 x float> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x float> [[TMP18]], i32 2
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <8 x float> [[TMP18]], i32 3
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x float> [[TMP18]], i32 4
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll
@@ -14,12 +14,12 @@
 ; CHECK-LABEL: @Rf_GReset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* @d, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP1]]
 ; CHECK-NEXT:    br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label [[IF_THEN:%.*]], label [[IF_END7:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], undef
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], poison
+; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], poison
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[TMP5]], [[TMP6]]
@@ -55,12 +55,12 @@
 ; CHECK-LABEL: @Rf_GReset_unary_fneg(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* @d, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x double> [[TMP1]]
 ; CHECK-NEXT:    br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label [[IF_THEN:%.*]], label [[IF_END7:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], undef
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], poison
+; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], poison
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[TMP5]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
@@ -12,12 +12,12 @@
 ; CHECK-NEXT:    invoke void @foo()
 ; CHECK-NEXT:    to label [[DONE:%.*]] unwind label [[LPAD]]
 ; CHECK:       lpad:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ undef, [[ENTRY:%.*]] ], [ undef, [[INNER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ poison, [[ENTRY:%.*]] ], [ poison, [[INNER]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = landingpad { i8*, i32 }
 ; CHECK-NEXT:    catch i8* null
 ; CHECK-NEXT:    br label [[DONE]]
 ; CHECK:       done:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ undef, [[INNER]] ], [ [[TMP0]], [[LPAD]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ poison, [[INNER]] ], [ [[TMP0]], [[LPAD]] ]
 ; CHECK-NEXT:    ret void
 ;
   bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -69,32 +69,27 @@
 ; SSE-LABEL: @pr35497(
 ; SSE-NEXT:  entry:
 ; SSE-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
-; SSE-NEXT:    [[AND:%.*]] = shl i64 [[TMP0]], 2
-; SSE-NEXT:    [[SHL:%.*]] = and i64 [[AND]], 20
 ; SSE-NEXT:    [[ADD:%.*]] = add i64 undef, undef
 ; SSE-NEXT:    store i64 [[ADD]], i64* undef, align 1
 ; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
-; SSE-NEXT:    [[AND_1:%.*]] = shl i64 undef, 2
-; SSE-NEXT:    [[SHL_1:%.*]] = and i64 [[AND_1]], 20
-; SSE-NEXT:    [[SHR_1:%.*]] = lshr i64 undef, 6
-; SSE-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]]
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 1
+; SSE-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; SSE-NEXT:    [[SHR_2:%.*]] = lshr i64 undef, 6
-; SSE-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]]
-; SSE-NEXT:    [[AND_4:%.*]] = shl i64 [[ADD]], 2
-; SSE-NEXT:    [[SHL_4:%.*]] = and i64 [[AND_4]], 20
+; SSE-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], poison
 ; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
-; SSE-NEXT:    store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2
-; SSE-NEXT:    [[SHL_5:%.*]] = and i64 [[AND_5]], 20
-; SSE-NEXT:    [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6
-; SSE-NEXT:    [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]]
-; SSE-NEXT:    store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
 ; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
-; SSE-NEXT:    store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1
-; SSE-NEXT:    [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6
-; SSE-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]]
-; SSE-NEXT:    store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; SSE-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @pr35497(
@@ -103,11 +98,11 @@
 ; AVX-NEXT:    [[ADD:%.*]] = add i64 undef, undef
 ; AVX-NEXT:    store i64 [[ADD]], i64* undef, align 1
 ; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 1
 ; AVX-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
 ; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], poison
 ; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
 ; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
 ; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll
@@ -13,9 +13,10 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01, float 1.300000e+01, float 1.400000e+01>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[VECIN0:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECIN2:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll
@@ -13,9 +13,10 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.100000e+01, float 1.200000e+01, float 1.300000e+01, float 1.400000e+01>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[VECIN0:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECIN2:%.*]] = insertelement <2 x float> undef, float [[TMP6]], i64 0
@@ -64,13 +65,13 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] %StructIn0, float [[TMP5]], 1
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCTTY]] undef, float [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
-; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] %StructIn2, float [[TMP7]], 1
-; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x %StructTy] undef, [[STRUCTTY]] %StructIn1, 0
-; CHECK-NEXT:    [[RET1:%.*]] = insertvalue [2 x %StructTy] [[RET0]], [[STRUCTTY]] %StructIn3, 1
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN2]], float [[TMP7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x %StructTy] undef, [[STRUCTTY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue [2 x %StructTy] [[RET0]], [[STRUCTTY]] [[STRUCTIN3]], 1
 ; CHECK-NEXT:    ret [2 x %StructTy] [[RET1]]
 ;
   %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0
@@ -110,13 +111,13 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] %StructIn0, float [[TMP5]], 1
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCTTY]] undef, float [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
-; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] %StructIn2, float [[TMP7]], 1
-; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } undef, [[STRUCTTY]] %StructIn1, 0
-; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } [[RET0]], [[STRUCTTY]] %StructIn3, 1
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN2]], float [[TMP7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } undef, [[STRUCTTY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } [[RET0]], [[STRUCTTY]] [[STRUCTIN3]], 1
 ; CHECK-NEXT:    ret { [[STRUCTTY]], [[STRUCTTY]] } [[RET1]]
 ;
   %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0
@@ -159,8 +160,8 @@
 ; CHECK-NEXT:    [[FADD2:%.*]] = fadd fast float [[L2]], 1.300000e+01
 ; CHECK-NEXT:    [[FADD3:%.*]] = fadd fast float [[L3]], 1.400000e+01
 ; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[FADD0]], 0
-; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] %StructIn0, float [[FADD1]], 1
-; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCTTY]], float, float } undef, [[STRUCTTY]] %StructIn1, 0
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[FADD1]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCTTY]], float, float } undef, [[STRUCTTY]] [[STRUCTIN1]], 0
 ; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET0]], float [[FADD2]], 1
 ; CHECK-NEXT:    [[RET2:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET1]], float [[FADD3]], 2
 ; CHECK-NEXT:    ret { [[STRUCTTY]], float, float } [[RET2]]
@@ -207,25 +208,25 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[STRUCTIN0:%.*]] = insertvalue [[STRUCT1TY:%.*]] undef, i16 [[TMP4]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] %StructIn0, i16 [[TMP5]], 1
+; CHECK-NEXT:    [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN0]], i16 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[STRUCTIN2:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] %StructIn2, i16 [[TMP7]], 1
+; CHECK-NEXT:    [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN2]], i16 [[TMP7]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
 ; CHECK-NEXT:    [[STRUCTIN4:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP8]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; CHECK-NEXT:    [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] %StructIn4, i16 [[TMP9]], 1
+; CHECK-NEXT:    [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN4]], i16 [[TMP9]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
 ; CHECK-NEXT:    [[STRUCTIN6:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; CHECK-NEXT:    [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] %StructIn6, i16 [[TMP11]], 1
-; CHECK-NEXT:    [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] %StructIn1, 0
-; CHECK-NEXT:    [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] %Struct2In0, [[STRUCT1TY]] %StructIn3, 1
-; CHECK-NEXT:    [[STRUCT2IN2:%.*]] = insertvalue [[STRUCT2TY]] undef, [[STRUCT1TY]] %StructIn5, 0
-; CHECK-NEXT:    [[STRUCT2IN3:%.*]] = insertvalue [[STRUCT2TY]] %Struct2In2, [[STRUCT1TY]] %StructIn7, 1
-; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } undef, [[STRUCT2TY]] %Struct2In1, 0
-; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET0]], [[STRUCT2TY]] %Struct2In3, 1
+; CHECK-NEXT:    [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN6]], i16 [[TMP11]], 1
+; CHECK-NEXT:    [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] [[STRUCTIN1]], 0
+; CHECK-NEXT:    [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN0]], [[STRUCT1TY]] [[STRUCTIN3]], 1
+; CHECK-NEXT:    [[STRUCT2IN2:%.*]] = insertvalue [[STRUCT2TY]] undef, [[STRUCT1TY]] [[STRUCTIN5]], 0
+; CHECK-NEXT:    [[STRUCT2IN3:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN2]], [[STRUCT1TY]] [[STRUCTIN7]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } undef, [[STRUCT2TY]] [[STRUCT2IN1]], 0
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET0]], [[STRUCT2TY]] [[STRUCT2IN3]], 1
 ; CHECK-NEXT:    ret { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET1]]
 ;
   %GEP0 = getelementptr inbounds i16, i16* %Ptr, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -11,21 +11,16 @@
 
 define void @foo() {
 ; SSE-LABEL: @foo(
-; SSE-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
-; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
-; SSE-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
-; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8
-; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4
-; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16
-; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4
-; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8
-; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @b to <4 x i32>*), align 16
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+; SSE-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([8 x i32]* @a to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4) to <4 x i32>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @foo(
-; AVX-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @b to <4 x i32>*), align 16
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 0, i32 2, i32 0, i32 2>
 ; AVX-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
 ; AVX-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -1,27 +1,76 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
 
 define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
-; CHECK-LABEL: @gather_load(
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @gather_load(
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i32 1
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 7, i32 4>
+; SSE-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], <i32 1, i32 2, i32 3, i32 4>
+; SSE-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, [[TBAA0]]
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @gather_load(
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @gather_load(
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @gather_load(
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX512-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX512-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; AVX512-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
   %4 = load i32, i32* %1, align 4, !tbaa !2
@@ -46,7 +95,7 @@
 define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
 ; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
 ; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
@@ -68,41 +117,45 @@
 ;
 ; AVX-LABEL: @gather_load_2(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
-; AVX-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
-; AVX2-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_2(
 ; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
 ; AVX512-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX512-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
 ; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
@@ -175,72 +228,64 @@
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; AVX-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 3
+; AVX-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 5
+; AVX-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0
+; AVX-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP25]], i32 6
+; AVX-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP15]], i32 7
+; AVX-NEXT:    [[TMP28:%.*]] = add <8 x i32> [[TMP27]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP29]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX2-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX2-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX2-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX2-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP7]], i32 2
+; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; AVX2-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 3
+; AVX2-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; AVX2-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 4
+; AVX2-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; AVX2-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 5
+; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0
+; AVX2-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP25]], i32 6
+; AVX2-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP15]], i32 7
+; AVX2-NEXT:    [[TMP28:%.*]] = add <8 x i32> [[TMP27]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP29]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_3(
@@ -248,28 +293,13 @@
 ; AVX512-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
 ; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
 ; AVX512-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
-; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX512-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP6]], <8 x i32*> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> <i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21, i64 poison>
+; AVX512-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>, <8 x i32> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>), [[TBAA0]]
 ; AVX512-NEXT:    ret void
 ;
   %3 = load i32, i32* %1, align 4, !tbaa !2
@@ -356,102 +386,79 @@
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32* [[T26]] to <4 x i32>*
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i32 4
+; AVX-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 5
+; AVX-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 6
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T31]], i32 7
+; AVX-NEXT:    [[TMP17:%.*]] = add <8 x i32> [[TMP16]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX2-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX2-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; AVX2-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX2-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; AVX2-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
 ; AVX2-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX2-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX2-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[T26]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX2-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX2-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX2-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T7]], i32 1
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T11]], i32 2
+; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 5
+; AVX2-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 6
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T31]], i32 7
+; AVX2-NEXT:    [[TMP17:%.*]] = add <8 x i32> [[TMP16]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_4(
 ; AVX512-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX512-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP1]], <8 x i32*> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> <i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21, i64 poison>
 ; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>, <8 x i32> undef), [[TBAA0]]
 ; AVX512-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
+; AVX512-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 poison>
 ; AVX512-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i32* [[T5]] to <8 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>), [[TBAA0]]
 ; AVX512-NEXT:    ret void
 ;
   %t5 = getelementptr inbounds i32, i32* %t0, i64 1
@@ -502,78 +509,174 @@
 
 define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 4, i64 13, i64 11, i64 44>
-; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]]
-; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
-; SSE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
-; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 33, i64 30, i64 27, i64 23>
-; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]]
-; SSE-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; SSE-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i32 1
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i32 2
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i32 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i32 1
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i32 2
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i32 3
+; SSE-NEXT:    [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]]
+; SSE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; SSE-NEXT:    store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; SSE-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; SSE-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; SSE-NEXT:    [[TMP34:%.*]] = bitcast float* [[TMP33]] to <4 x float>*
+; SSE-NEXT:    [[TMP35:%.*]] = load <4 x float>, <4 x float>* [[TMP34]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; SSE-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>*
+; SSE-NEXT:    [[TMP38:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; SSE-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i32 0
+; SSE-NEXT:    [[TMP44:%.*]] = extractelement <4 x float> [[TMP35]], i32 3
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP43]], float [[TMP44]], i32 1
+; SSE-NEXT:    [[TMP46:%.*]] = extractelement <4 x float> [[TMP35]], i32 0
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP46]], i32 2
+; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP40]], i32 3
+; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i32 0
+; SSE-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP38]], i32 3
+; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP50]], i32 1
+; SSE-NEXT:    [[TMP52:%.*]] = extractelement <4 x float> [[TMP38]], i32 0
+; SSE-NEXT:    [[TMP53:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP52]], i32 2
+; SSE-NEXT:    [[TMP54:%.*]] = insertelement <4 x float> [[TMP53]], float [[TMP42]], i32 3
+; SSE-NEXT:    [[TMP55:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP54]]
+; SSE-NEXT:    [[TMP56:%.*]] = bitcast float* [[TMP27]] to <4 x float>*
+; SSE-NEXT:    store <4 x float> [[TMP55]], <4 x float>* [[TMP56]], align 4, [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; AVX-NEXT:    [[TMP18:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
+; AVX-NEXT:    [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; AVX-NEXT:    [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP7]], i32 1
+; AVX-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP11]], i32 2
+; AVX-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 3
+; AVX-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP36]], i32 4
+; AVX-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP23]], i32 3
+; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP38]], i32 5
+; AVX-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP23]], i32 0
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP40]], i32 6
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP28]], i32 7
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP9]], i32 1
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP13]], i32 2
+; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP16]], i32 3
+; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP20]], i32 4
+; AVX-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP26]], i32 3
+; AVX-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 5
+; AVX-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP26]], i32 0
+; AVX-NEXT:    [[TMP51:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP50]], i32 6
+; AVX-NEXT:    [[TMP52:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP30]], i32 7
+; AVX-NEXT:    [[TMP53:%.*]] = fdiv <8 x float> [[TMP42]], [[TMP52]]
+; AVX-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX-NEXT:    store <8 x float> [[TMP53]], <8 x float>* [[TMP54]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX2-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; AVX2-NEXT:    [[TMP18:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX2-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
+; AVX2-NEXT:    [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX2-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; AVX2-NEXT:    [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX2-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP11]], i32 2
+; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 3
+; AVX2-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP36]], i32 4
+; AVX2-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP23]], i32 3
+; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP38]], i32 5
+; AVX2-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP23]], i32 0
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP40]], i32 6
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP28]], i32 7
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP9]], i32 1
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP13]], i32 2
+; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP16]], i32 3
+; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP20]], i32 4
+; AVX2-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP26]], i32 3
+; AVX2-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 5
+; AVX2-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP26]], i32 0
+; AVX2-NEXT:    [[TMP51:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP50]], i32 6
+; AVX2-NEXT:    [[TMP52:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP30]], i32 7
+; AVX2-NEXT:    [[TMP53:%.*]] = fdiv <8 x float> [[TMP42]], [[TMP52]]
+; AVX2-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX2-NEXT:    store <8 x float> [[TMP53]], <8 x float>* [[TMP54]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -1,27 +1,76 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
+; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
 
 define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
-; CHECK-LABEL: @gather_load(
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @gather_load(
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i32 1
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 7, i32 4>
+; SSE-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], <i32 1, i32 2, i32 3, i32 4>
+; SSE-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, [[TBAA0]]
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @gather_load(
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @gather_load(
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @gather_load(
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX512-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX512-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
+; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
+; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX512-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
+; AVX512-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
   %4 = load i32, i32* %1, align 4, !tbaa !2
@@ -46,7 +95,7 @@
 define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
 ; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
 ; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
@@ -68,41 +117,45 @@
 ;
 ; AVX-LABEL: @gather_load_2(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
-; AVX-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
-; AVX2-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_2(
 ; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
+; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
 ; AVX512-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX512-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
 ; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
@@ -175,72 +228,64 @@
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; AVX-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 3
+; AVX-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 5
+; AVX-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0
+; AVX-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP25]], i32 6
+; AVX-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP15]], i32 7
+; AVX-NEXT:    [[TMP28:%.*]] = add <8 x i32> [[TMP27]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP29]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX2-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX2-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX2-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX2-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP5]], i32 1
+; AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP7]], i32 2
+; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; AVX2-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP19]], i32 3
+; AVX2-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; AVX2-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP21]], i32 4
+; AVX2-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; AVX2-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP23]], i32 5
+; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0
+; AVX2-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP25]], i32 6
+; AVX2-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP15]], i32 7
+; AVX2-NEXT:    [[TMP28:%.*]] = add <8 x i32> [[TMP27]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP29]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_3(
@@ -248,28 +293,13 @@
 ; AVX512-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
 ; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
 ; AVX512-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
-; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX512-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP6]], <8 x i32*> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>
+; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> <i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21, i64 poison>
+; AVX512-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>, <8 x i32> undef), [[TBAA0]]
+; AVX512-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>), [[TBAA0]]
 ; AVX512-NEXT:    ret void
 ;
   %3 = load i32, i32* %1, align 4, !tbaa !2
@@ -356,102 +386,79 @@
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32* [[T26]] to <4 x i32>*
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i32 4
+; AVX-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 5
+; AVX-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 6
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T31]], i32 7
+; AVX-NEXT:    [[TMP17:%.*]] = add <8 x i32> [[TMP16]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX2-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX2-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; AVX2-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX2-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; AVX2-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
 ; AVX2-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX2-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX2-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX2-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[T26]] to <4 x i32>*
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX2-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX2-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX2-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX2-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T7]], i32 1
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T11]], i32 2
+; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 5
+; AVX2-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 6
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T31]], i32 7
+; AVX2-NEXT:    [[TMP17:%.*]] = add <8 x i32> [[TMP16]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    [[TMP18:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_4(
 ; AVX512-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX512-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
-; AVX512-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX512-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
-; AVX512-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32*> [[TMP1]], <8 x i32*> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>
+; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> <i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21, i64 poison>
 ; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
-; AVX512-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>, <8 x i32> undef), [[TBAA0]]
 ; AVX512-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX512-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX512-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX512-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
+; AVX512-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 poison>
 ; AVX512-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX512-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX512-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX512-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i32* [[T5]] to <8 x i32>*
+; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>), [[TBAA0]]
 ; AVX512-NEXT:    ret void
 ;
   %t5 = getelementptr inbounds i32, i32* %t0, i64 1
@@ -502,78 +509,174 @@
 
 define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2
-; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3
-; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 4, i64 13, i64 11, i64 44>
-; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]]
-; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
-; SSE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]]
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
-; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 33, i64 30, i64 27, i64 23>
-; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
-; SSE-NEXT:    [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]]
-; SSE-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; SSE-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i32 1
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i32 2
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i32 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i32 1
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i32 2
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i32 3
+; SSE-NEXT:    [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]]
+; SSE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; SSE-NEXT:    store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; SSE-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; SSE-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; SSE-NEXT:    [[TMP34:%.*]] = bitcast float* [[TMP33]] to <4 x float>*
+; SSE-NEXT:    [[TMP35:%.*]] = load <4 x float>, <4 x float>* [[TMP34]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; SSE-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>*
+; SSE-NEXT:    [[TMP38:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; SSE-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, [[TBAA0]]
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i32 0
+; SSE-NEXT:    [[TMP44:%.*]] = extractelement <4 x float> [[TMP35]], i32 3
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP43]], float [[TMP44]], i32 1
+; SSE-NEXT:    [[TMP46:%.*]] = extractelement <4 x float> [[TMP35]], i32 0
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP46]], i32 2
+; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP40]], i32 3
+; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i32 0
+; SSE-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP38]], i32 3
+; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP50]], i32 1
+; SSE-NEXT:    [[TMP52:%.*]] = extractelement <4 x float> [[TMP38]], i32 0
+; SSE-NEXT:    [[TMP53:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP52]], i32 2
+; SSE-NEXT:    [[TMP54:%.*]] = insertelement <4 x float> [[TMP53]], float [[TMP42]], i32 3
+; SSE-NEXT:    [[TMP55:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP54]]
+; SSE-NEXT:    [[TMP56:%.*]] = bitcast float* [[TMP27]] to <4 x float>*
+; SSE-NEXT:    store <4 x float> [[TMP55]], <4 x float>* [[TMP56]], align 4, [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; AVX-NEXT:    [[TMP18:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
+; AVX-NEXT:    [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; AVX-NEXT:    [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP7]], i32 1
+; AVX-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP11]], i32 2
+; AVX-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 3
+; AVX-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP36]], i32 4
+; AVX-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP23]], i32 3
+; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP38]], i32 5
+; AVX-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP23]], i32 0
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP40]], i32 6
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP28]], i32 7
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP9]], i32 1
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP13]], i32 2
+; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP16]], i32 3
+; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP20]], i32 4
+; AVX-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP26]], i32 3
+; AVX-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 5
+; AVX-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP26]], i32 0
+; AVX-NEXT:    [[TMP51:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP50]], i32 6
+; AVX-NEXT:    [[TMP52:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP30]], i32 7
+; AVX-NEXT:    [[TMP53:%.*]] = fdiv <8 x float> [[TMP42]], [[TMP52]]
+; AVX-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX-NEXT:    store <8 x float> [[TMP53]], <8 x float>* [[TMP54]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX2-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; AVX2-NEXT:    [[TMP18:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX2-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP19]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
+; AVX2-NEXT:    [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[TMP22]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX2-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>*
+; AVX2-NEXT:    [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP27]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX2-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, [[TBAA0]]
+; AVX2-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[TMP11]], i32 2
+; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 3
+; AVX2-NEXT:    [[TMP36:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP36]], i32 4
+; AVX2-NEXT:    [[TMP38:%.*]] = extractelement <4 x float> [[TMP23]], i32 3
+; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP38]], i32 5
+; AVX2-NEXT:    [[TMP40:%.*]] = extractelement <4 x float> [[TMP23]], i32 0
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP40]], i32 6
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP28]], i32 7
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP9]], i32 1
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP13]], i32 2
+; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP16]], i32 3
+; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP20]], i32 4
+; AVX2-NEXT:    [[TMP48:%.*]] = extractelement <4 x float> [[TMP26]], i32 3
+; AVX2-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 5
+; AVX2-NEXT:    [[TMP50:%.*]] = extractelement <4 x float> [[TMP26]], i32 0
+; AVX2-NEXT:    [[TMP51:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP50]], i32 6
+; AVX2-NEXT:    [[TMP52:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP30]], i32 7
+; AVX2-NEXT:    [[TMP53:%.*]] = fdiv <8 x float> [[TMP42]], [[TMP52]]
+; AVX2-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX2-NEXT:    store <8 x float> [[TMP53]], <8 x float>* [[TMP54]], align 4, [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -4,17 +4,10 @@
 
 define dso_local <4 x float> @foo(<4 x i32> %0) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP6]] to float
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP7]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = sitofp i32 [[TMP9]] to float
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP10]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[SHUFFLE]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 2>
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = extractelement <4 x i32> %0, i32 1
   %3 = sitofp i32 %2 to float
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -88,8 +88,8 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[FNEG]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
@@ -137,8 +137,8 @@
 ; CHECK-LABEL: @fcmp_lt(
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[FNEG]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -12,31 +12,31 @@
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[T:%.*]] = select i1 undef, i16 undef, i16 15
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[T]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> <i32 undef, i32 63>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
-; CHECK-NEXT:    [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE10]], <i32 15, i32 31, i32 47, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[SHUFFLE]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> <i32 poison, i32 63>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], poison
+; CHECK-NEXT:    [[SHUFFLE11:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE11]], <i32 15, i32 31, i32 47, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef
 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
-; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]])
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP10]], undef
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP10]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef
-; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = select i1 [[OP_EXTRA2]], i32 [[OP_EXTRA1]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = select i1 [[OP_EXTRA4]], i32 [[OP_EXTRA3]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef
-; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = select i1 [[OP_EXTRA6]], i32 [[OP_EXTRA5]], i32 undef
-; CHECK-NEXT:    [[OP_EXTRA8:%.*]] = icmp slt i32 [[OP_EXTRA7]], undef
-; CHECK-NEXT:    [[OP_EXTRA9:%.*]] = select i1 [[OP_EXTRA8]], i32 [[OP_EXTRA7]], i32 undef
-; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i32> poison, [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], poison
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE1]], <i32 -49, i32 -33, i32 -33, i32 -17>
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP9]], undef
+; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP9]], i32 undef
+; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = icmp slt i32 [[OP_EXTRA2]], undef
+; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA3]], i32 [[OP_EXTRA2]], i32 undef
+; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef
+; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = select i1 [[OP_EXTRA5]], i32 [[OP_EXTRA4]], i32 undef
+; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef
+; CHECK-NEXT:    [[OP_EXTRA8:%.*]] = select i1 [[OP_EXTRA7]], i32 [[OP_EXTRA6]], i32 undef
+; CHECK-NEXT:    [[OP_EXTRA9:%.*]] = icmp slt i32 [[OP_EXTRA8]], undef
+; CHECK-NEXT:    [[OP_EXTRA10:%.*]] = select i1 [[OP_EXTRA9]], i32 [[OP_EXTRA8]], i32 undef
+; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA10]]
 ; CHECK-NEXT:    unreachable
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -19,63 +19,52 @@
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
-; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
-; CHECK-NEXT:    [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
-; CHECK-NEXT:    [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <16 x i32> [[SHUFFLE]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP11]], i32 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP13]], i32 5
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP15]], i32 6
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP17]], i32 7
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP19]], i32 8
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP21]], i32 9
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP23]], i32 10
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP25]], i32 11
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP27]], i32 12
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP29]], i32 13
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP31]], i32 14
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP3]], i32 15
+; CHECK-NEXT:    [[TMP34:%.*]] = trunc <16 x i32> [[TMP33]] to <16 x i8>
+; CHECK-NEXT:    [[TMP35:%.*]] = and <16 x i8> [[TMP34]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP35]], <16 x i8>* [[TMP36]], align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx -slp-min-non-power2-values-size=2 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.9.0"
@@ -23,41 +23,39 @@
 define float @foo(float* nocapture readonly %A) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ [[TMP2]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00
-; CHECK-NEXT:    [[ADD4]] = fadd float [[R_030]], [[MUL]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00
-; CHECK-NEXT:    [[ADD9]] = fadd float [[G_031]], [[MUL8]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00
-; CHECK-NEXT:    [[ADD14]] = fadd float [[B_032]], [[MUL13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP11]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul <4 x float> [[TMP12]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float poison>
+; CHECK-NEXT:    [[TMP14]] = fadd <4 x float> [[TMP4]], [[TMP13]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP15]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.for.body_crit_edge:
 ; CHECK-NEXT:    [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]]
-; CHECK-NEXT:    [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP14]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd float [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP14]], i32 2
+; CHECK-NEXT:    [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP18]]
 ; CHECK-NEXT:    ret float [[ADD17]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -10,18 +10,10 @@
 define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
 ; CHECK-LABEL: @slp_schedule_bundle(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
-; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
-; CHECK-NEXT:    [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
-; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
-; CHECK-NEXT:    [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
-; CHECK-NEXT:    [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
-; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast ([1 x i32]* @b to <8 x i32>*), i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i32> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP2]], <8 x i32>* bitcast ([1 x i32]* @a to <8 x i32>*), i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>)
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -15,9 +15,9 @@
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4
 ; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> undef, <4 x i32> [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], poison
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> poison, <4 x i32> [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> poison, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 8
@@ -66,7 +66,7 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[SHRINK_SHUFFLE:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ poison, [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[IF_END:%.*]] ]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
@@ -78,7 +78,10 @@
 ; CHECK-NEXT:    [[ARRAYIDX11_6:%.*]] = getelementptr inbounds i16, i16* undef, i32 6
 ; CHECK-NEXT:    [[ARRAYIDX11_7:%.*]] = getelementptr inbounds i16, i16* undef, i32 7
 ; CHECK-NEXT:    store <8 x i16> [[SHUFFLE]], <8 x i16>* undef, align 2
-; CHECK-NEXT:    [[SHRINK_SHUFFLE]] = shufflevector <8 x i16> [[SHUFFLE]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i16> [[SHUFFLE]], i32 4
+; CHECK-NEXT:    [[TMP4]] = insertelement <2 x i16> [[TMP2]], i16 [[TMP3]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -23,7 +23,7 @@
 ; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
 ; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
 ; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
-; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
+; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
 ; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
 ; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
 ; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
@@ -8,248 +8,240 @@
 define void @n() local_unnamed_addr #0 {
 ; CHECK-LABEL: @n(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 0, i64 1) to <4 x i32>*), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 1, i64 1), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 1, i64 2), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 1, i64 3), align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 0), align 16
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 1), align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 2), align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 3), align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 0), align 16
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 1), align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 2), align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 3), align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 0), align 16
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 1), align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 2), align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 3), align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 0), align 16
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 1), align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 2), align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 3), align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 0), align 16
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 1), align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 2), align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 3), align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 0), align 16
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 1), align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 2), align 8
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 3), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x [4 x i32]]* @k to <8 x i32>*), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 1, i64 1), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 1, i64 2), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 1, i64 3), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 0), align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 1), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 2), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 2, i64 3), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 0), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 1), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 2), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 3, i64 3), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 0), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 1), align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 2), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 4, i64 3), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 0), align 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 1), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 2), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 5, i64 3), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 0), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 1), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 2), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 6, i64 3), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 0), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 1), align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 2), align 8
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([8 x [4 x i32]], [8 x [4 x i32]]* @k, i64 0, i64 7, i64 3), align 4
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[B_0:%.*]] = phi i32 [ [[SPEC_SELECT8_3_7:%.*]], [[FOR_COND]] ], [ undef, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP29]], -183
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP30]], [[TMP0]]
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp slt i32 [[SUB]], 0
-; CHECK-NEXT:    [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[NEG]], i32 [[SUB]]
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> poison, i32 [[TMP30]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP30]], i32 2
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP30]], i32 3
-; CHECK-NEXT:    [[TMP37:%.*]] = sub <4 x i32> [[TMP36]], [[TMP1]]
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp slt <4 x i32> [[TMP37]], zeroinitializer
-; CHECK-NEXT:    [[TMP39:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP37]]
-; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP38]], <4 x i32> [[TMP39]], <4 x i32> [[TMP37]]
-; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]])
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp slt i32 [[TMP41]], [[TMP32]]
-; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP41]], i32 [[TMP32]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP43]], [[B_0]]
-; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP43]], i32 [[B_0]]
-; CHECK-NEXT:    [[SUB_1_1:%.*]] = sub i32 [[TMP30]], [[TMP2]]
-; CHECK-NEXT:    [[TMP44:%.*]] = icmp slt i32 [[SUB_1_1]], 0
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP28]], -183
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> poison, i32 [[TMP29]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP30]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = sub <8 x i32> [[SHUFFLE]], [[TMP0]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp slt <8 x i32> [[TMP31]], <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP33:%.*]] = sub nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>, [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = select <8 x i1> [[TMP32]], <8 x i32> [[TMP33]], <8 x i32> [[TMP31]]
+; CHECK-NEXT:    [[REDUCTION_NORMALIZATION:%.*]] = shufflevector <8 x i32> [[TMP34]], <8 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 10>
+; CHECK-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> [[REDUCTION_NORMALIZATION]])
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP35]], [[B_0]]
+; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP35]], i32 [[B_0]]
+; CHECK-NEXT:    [[SUB_1_1:%.*]] = sub i32 [[TMP29]], [[TMP1]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp slt i32 [[SUB_1_1]], 0
 ; CHECK-NEXT:    [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]]
-; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[NEG_1_1]], i32 [[SUB_1_1]]
-; CHECK-NEXT:    [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP45]], [[OP_EXTRA1]]
+; CHECK-NEXT:    [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[NEG_1_1]], i32 [[SUB_1_1]]
+; CHECK-NEXT:    [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP37]], [[OP_EXTRA1]]
 ; CHECK-NEXT:    [[NARROW:%.*]] = or i1 [[CMP12_1_1]], [[OP_EXTRA]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP45]], i32 [[OP_EXTRA1]]
-; CHECK-NEXT:    [[SUB_2_1:%.*]] = sub i32 [[TMP30]], [[TMP3]]
-; CHECK-NEXT:    [[TMP46:%.*]] = icmp slt i32 [[SUB_2_1]], 0
+; CHECK-NEXT:    [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP37]], i32 [[OP_EXTRA1]]
+; CHECK-NEXT:    [[SUB_2_1:%.*]] = sub i32 [[TMP29]], [[TMP2]]
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp slt i32 [[SUB_2_1]], 0
 ; CHECK-NEXT:    [[NEG_2_1:%.*]] = sub nsw i32 0, [[SUB_2_1]]
-; CHECK-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[NEG_2_1]], i32 [[SUB_2_1]]
-; CHECK-NEXT:    [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP47]], [[SPEC_SELECT8_1_1]]
+; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[NEG_2_1]], i32 [[SUB_2_1]]
+; CHECK-NEXT:    [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP39]], [[SPEC_SELECT8_1_1]]
 ; CHECK-NEXT:    [[NARROW34:%.*]] = or i1 [[CMP12_2_1]], [[NARROW]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP47]], i32 [[SPEC_SELECT8_1_1]]
-; CHECK-NEXT:    [[SUB_3_1:%.*]] = sub i32 [[TMP30]], [[TMP4]]
-; CHECK-NEXT:    [[TMP48:%.*]] = icmp slt i32 [[SUB_3_1]], 0
+; CHECK-NEXT:    [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP39]], i32 [[SPEC_SELECT8_1_1]]
+; CHECK-NEXT:    [[SUB_3_1:%.*]] = sub i32 [[TMP29]], [[TMP3]]
+; CHECK-NEXT:    [[TMP40:%.*]] = icmp slt i32 [[SUB_3_1]], 0
 ; CHECK-NEXT:    [[NEG_3_1:%.*]] = sub nsw i32 0, [[SUB_3_1]]
-; CHECK-NEXT:    [[TMP49:%.*]] = select i1 [[TMP48]], i32 [[NEG_3_1]], i32 [[SUB_3_1]]
-; CHECK-NEXT:    [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP49]], [[SPEC_SELECT8_2_1]]
+; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[NEG_3_1]], i32 [[SUB_3_1]]
+; CHECK-NEXT:    [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP41]], [[SPEC_SELECT8_2_1]]
 ; CHECK-NEXT:    [[NARROW35:%.*]] = or i1 [[CMP12_3_1]], [[NARROW34]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_1:%.*]] = zext i1 [[NARROW35]] to i32
-; CHECK-NEXT:    [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP49]], i32 [[SPEC_SELECT8_2_1]]
-; CHECK-NEXT:    [[SUB_222:%.*]] = sub i32 [[TMP30]], [[TMP5]]
-; CHECK-NEXT:    [[TMP50:%.*]] = icmp slt i32 [[SUB_222]], 0
+; CHECK-NEXT:    [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP41]], i32 [[SPEC_SELECT8_2_1]]
+; CHECK-NEXT:    [[SUB_222:%.*]] = sub i32 [[TMP29]], [[TMP4]]
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp slt i32 [[SUB_222]], 0
 ; CHECK-NEXT:    [[NEG_223:%.*]] = sub nsw i32 0, [[SUB_222]]
-; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[NEG_223]], i32 [[SUB_222]]
-; CHECK-NEXT:    [[CMP12_224:%.*]] = icmp slt i32 [[TMP51]], [[SPEC_SELECT8_3_1]]
-; CHECK-NEXT:    [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP51]], i32 [[SPEC_SELECT8_3_1]]
-; CHECK-NEXT:    [[SUB_1_2:%.*]] = sub i32 [[TMP30]], [[TMP6]]
-; CHECK-NEXT:    [[TMP52:%.*]] = icmp slt i32 [[SUB_1_2]], 0
+; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[NEG_223]], i32 [[SUB_222]]
+; CHECK-NEXT:    [[CMP12_224:%.*]] = icmp slt i32 [[TMP43]], [[SPEC_SELECT8_3_1]]
+; CHECK-NEXT:    [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP43]], i32 [[SPEC_SELECT8_3_1]]
+; CHECK-NEXT:    [[SUB_1_2:%.*]] = sub i32 [[TMP29]], [[TMP5]]
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp slt i32 [[SUB_1_2]], 0
 ; CHECK-NEXT:    [[NEG_1_2:%.*]] = sub nsw i32 0, [[SUB_1_2]]
-; CHECK-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[NEG_1_2]], i32 [[SUB_1_2]]
-; CHECK-NEXT:    [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP53]], [[SPEC_SELECT8_226]]
-; CHECK-NEXT:    [[TMP54:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP53]], i32 [[SPEC_SELECT8_226]]
-; CHECK-NEXT:    [[SUB_2_2:%.*]] = sub i32 [[TMP30]], [[TMP7]]
-; CHECK-NEXT:    [[TMP55:%.*]] = icmp slt i32 [[SUB_2_2]], 0
+; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[NEG_1_2]], i32 [[SUB_1_2]]
+; CHECK-NEXT:    [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP45]], [[SPEC_SELECT8_226]]
+; CHECK-NEXT:    [[TMP46:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]]
+; CHECK-NEXT:    [[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP45]], i32 [[SPEC_SELECT8_226]]
+; CHECK-NEXT:    [[SUB_2_2:%.*]] = sub i32 [[TMP29]], [[TMP6]]
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp slt i32 [[SUB_2_2]], 0
 ; CHECK-NEXT:    [[NEG_2_2:%.*]] = sub nsw i32 0, [[SUB_2_2]]
-; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[NEG_2_2]], i32 [[SUB_2_2]]
-; CHECK-NEXT:    [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP56]], [[SPEC_SELECT8_1_2]]
-; CHECK-NEXT:    [[TMP57:%.*]] = or i1 [[CMP12_2_2]], [[TMP54]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP56]], i32 [[SPEC_SELECT8_1_2]]
-; CHECK-NEXT:    [[SUB_3_2:%.*]] = sub i32 [[TMP30]], [[TMP8]]
-; CHECK-NEXT:    [[TMP58:%.*]] = icmp slt i32 [[SUB_3_2]], 0
+; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[NEG_2_2]], i32 [[SUB_2_2]]
+; CHECK-NEXT:    [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP48]], [[SPEC_SELECT8_1_2]]
+; CHECK-NEXT:    [[TMP49:%.*]] = or i1 [[CMP12_2_2]], [[TMP46]]
+; CHECK-NEXT:    [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP48]], i32 [[SPEC_SELECT8_1_2]]
+; CHECK-NEXT:    [[SUB_3_2:%.*]] = sub i32 [[TMP29]], [[TMP7]]
+; CHECK-NEXT:    [[TMP50:%.*]] = icmp slt i32 [[SUB_3_2]], 0
 ; CHECK-NEXT:    [[NEG_3_2:%.*]] = sub nsw i32 0, [[SUB_3_2]]
-; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[NEG_3_2]], i32 [[SUB_3_2]]
-; CHECK-NEXT:    [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP59]], [[SPEC_SELECT8_2_2]]
-; CHECK-NEXT:    [[TMP60:%.*]] = or i1 [[CMP12_3_2]], [[TMP57]]
-; CHECK-NEXT:    [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP60]], i32 2, i32 [[SPEC_SELECT_3_1]]
-; CHECK-NEXT:    [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP59]], i32 [[SPEC_SELECT8_2_2]]
-; CHECK-NEXT:    [[SUB_328:%.*]] = sub i32 [[TMP30]], [[TMP9]]
-; CHECK-NEXT:    [[TMP61:%.*]] = icmp slt i32 [[SUB_328]], 0
+; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[NEG_3_2]], i32 [[SUB_3_2]]
+; CHECK-NEXT:    [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP51]], [[SPEC_SELECT8_2_2]]
+; CHECK-NEXT:    [[TMP52:%.*]] = or i1 [[CMP12_3_2]], [[TMP49]]
+; CHECK-NEXT:    [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP52]], i32 2, i32 [[SPEC_SELECT_3_1]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP51]], i32 [[SPEC_SELECT8_2_2]]
+; CHECK-NEXT:    [[SUB_328:%.*]] = sub i32 [[TMP29]], [[TMP8]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp slt i32 [[SUB_328]], 0
 ; CHECK-NEXT:    [[NEG_329:%.*]] = sub nsw i32 0, [[SUB_328]]
-; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[NEG_329]], i32 [[SUB_328]]
-; CHECK-NEXT:    [[CMP12_330:%.*]] = icmp slt i32 [[TMP62]], [[SPEC_SELECT8_3_2]]
-; CHECK-NEXT:    [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP62]], i32 [[SPEC_SELECT8_3_2]]
-; CHECK-NEXT:    [[SUB_1_3:%.*]] = sub i32 [[TMP30]], [[TMP10]]
-; CHECK-NEXT:    [[TMP63:%.*]] = icmp slt i32 [[SUB_1_3]], 0
+; CHECK-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[NEG_329]], i32 [[SUB_328]]
+; CHECK-NEXT:    [[CMP12_330:%.*]] = icmp slt i32 [[TMP54]], [[SPEC_SELECT8_3_2]]
+; CHECK-NEXT:    [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP54]], i32 [[SPEC_SELECT8_3_2]]
+; CHECK-NEXT:    [[SUB_1_3:%.*]] = sub i32 [[TMP29]], [[TMP9]]
+; CHECK-NEXT:    [[TMP55:%.*]] = icmp slt i32 [[SUB_1_3]], 0
 ; CHECK-NEXT:    [[NEG_1_3:%.*]] = sub nsw i32 0, [[SUB_1_3]]
-; CHECK-NEXT:    [[TMP64:%.*]] = select i1 [[TMP63]], i32 [[NEG_1_3]], i32 [[SUB_1_3]]
-; CHECK-NEXT:    [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP64]], [[SPEC_SELECT8_332]]
-; CHECK-NEXT:    [[TMP65:%.*]] = or i1 [[CMP12_1_3]], [[CMP12_330]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP64]], i32 [[SPEC_SELECT8_332]]
-; CHECK-NEXT:    [[SUB_2_3:%.*]] = sub i32 [[TMP30]], [[TMP11]]
-; CHECK-NEXT:    [[TMP66:%.*]] = icmp slt i32 [[SUB_2_3]], 0
+; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[NEG_1_3]], i32 [[SUB_1_3]]
+; CHECK-NEXT:    [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP56]], [[SPEC_SELECT8_332]]
+; CHECK-NEXT:    [[TMP57:%.*]] = or i1 [[CMP12_1_3]], [[CMP12_330]]
+; CHECK-NEXT:    [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP56]], i32 [[SPEC_SELECT8_332]]
+; CHECK-NEXT:    [[SUB_2_3:%.*]] = sub i32 [[TMP29]], [[TMP10]]
+; CHECK-NEXT:    [[TMP58:%.*]] = icmp slt i32 [[SUB_2_3]], 0
 ; CHECK-NEXT:    [[NEG_2_3:%.*]] = sub nsw i32 0, [[SUB_2_3]]
-; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[NEG_2_3]], i32 [[SUB_2_3]]
-; CHECK-NEXT:    [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP67]], [[SPEC_SELECT8_1_3]]
-; CHECK-NEXT:    [[TMP68:%.*]] = or i1 [[CMP12_2_3]], [[TMP65]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP67]], i32 [[SPEC_SELECT8_1_3]]
-; CHECK-NEXT:    [[SUB_3_3:%.*]] = sub i32 [[TMP30]], [[TMP12]]
-; CHECK-NEXT:    [[TMP69:%.*]] = icmp slt i32 [[SUB_3_3]], 0
+; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[NEG_2_3]], i32 [[SUB_2_3]]
+; CHECK-NEXT:    [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP59]], [[SPEC_SELECT8_1_3]]
+; CHECK-NEXT:    [[TMP60:%.*]] = or i1 [[CMP12_2_3]], [[TMP57]]
+; CHECK-NEXT:    [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP59]], i32 [[SPEC_SELECT8_1_3]]
+; CHECK-NEXT:    [[SUB_3_3:%.*]] = sub i32 [[TMP29]], [[TMP11]]
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp slt i32 [[SUB_3_3]], 0
 ; CHECK-NEXT:    [[NEG_3_3:%.*]] = sub nsw i32 0, [[SUB_3_3]]
-; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP69]], i32 [[NEG_3_3]], i32 [[SUB_3_3]]
-; CHECK-NEXT:    [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP70]], [[SPEC_SELECT8_2_3]]
-; CHECK-NEXT:    [[TMP71:%.*]] = or i1 [[CMP12_3_3]], [[TMP68]]
-; CHECK-NEXT:    [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP71]], i32 3, i32 [[SPEC_SELECT_3_2]]
-; CHECK-NEXT:    [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP70]], i32 [[SPEC_SELECT8_2_3]]
-; CHECK-NEXT:    [[SUB_4:%.*]] = sub i32 [[TMP30]], [[TMP13]]
-; CHECK-NEXT:    [[TMP72:%.*]] = icmp slt i32 [[SUB_4]], 0
+; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[NEG_3_3]], i32 [[SUB_3_3]]
+; CHECK-NEXT:    [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP62]], [[SPEC_SELECT8_2_3]]
+; CHECK-NEXT:    [[TMP63:%.*]] = or i1 [[CMP12_3_3]], [[TMP60]]
+; CHECK-NEXT:    [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP63]], i32 3, i32 [[SPEC_SELECT_3_2]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP62]], i32 [[SPEC_SELECT8_2_3]]
+; CHECK-NEXT:    [[SUB_4:%.*]] = sub i32 [[TMP29]], [[TMP12]]
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp slt i32 [[SUB_4]], 0
 ; CHECK-NEXT:    [[NEG_4:%.*]] = sub nsw i32 0, [[SUB_4]]
-; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP72]], i32 [[NEG_4]], i32 [[SUB_4]]
-; CHECK-NEXT:    [[CMP12_4:%.*]] = icmp slt i32 [[TMP73]], [[SPEC_SELECT8_3_3]]
-; CHECK-NEXT:    [[SPEC_SELECT8_4:%.*]] = select i1 [[CMP12_4]], i32 [[TMP73]], i32 [[SPEC_SELECT8_3_3]]
-; CHECK-NEXT:    [[SUB_1_4:%.*]] = sub i32 [[TMP30]], [[TMP14]]
-; CHECK-NEXT:    [[TMP74:%.*]] = icmp slt i32 [[SUB_1_4]], 0
+; CHECK-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[NEG_4]], i32 [[SUB_4]]
+; CHECK-NEXT:    [[CMP12_4:%.*]] = icmp slt i32 [[TMP65]], [[SPEC_SELECT8_3_3]]
+; CHECK-NEXT:    [[SPEC_SELECT8_4:%.*]] = select i1 [[CMP12_4]], i32 [[TMP65]], i32 [[SPEC_SELECT8_3_3]]
+; CHECK-NEXT:    [[SUB_1_4:%.*]] = sub i32 [[TMP29]], [[TMP13]]
+; CHECK-NEXT:    [[TMP66:%.*]] = icmp slt i32 [[SUB_1_4]], 0
 ; CHECK-NEXT:    [[NEG_1_4:%.*]] = sub nsw i32 0, [[SUB_1_4]]
-; CHECK-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[NEG_1_4]], i32 [[SUB_1_4]]
-; CHECK-NEXT:    [[CMP12_1_4:%.*]] = icmp slt i32 [[TMP75]], [[SPEC_SELECT8_4]]
-; CHECK-NEXT:    [[TMP76:%.*]] = or i1 [[CMP12_1_4]], [[CMP12_4]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_4:%.*]] = select i1 [[CMP12_1_4]], i32 [[TMP75]], i32 [[SPEC_SELECT8_4]]
-; CHECK-NEXT:    [[SUB_2_4:%.*]] = sub i32 [[TMP30]], [[TMP15]]
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp slt i32 [[SUB_2_4]], 0
+; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[NEG_1_4]], i32 [[SUB_1_4]]
+; CHECK-NEXT:    [[CMP12_1_4:%.*]] = icmp slt i32 [[TMP67]], [[SPEC_SELECT8_4]]
+; CHECK-NEXT:    [[TMP68:%.*]] = or i1 [[CMP12_1_4]], [[CMP12_4]]
+; CHECK-NEXT:    [[SPEC_SELECT8_1_4:%.*]] = select i1 [[CMP12_1_4]], i32 [[TMP67]], i32 [[SPEC_SELECT8_4]]
+; CHECK-NEXT:    [[SUB_2_4:%.*]] = sub i32 [[TMP29]], [[TMP14]]
+; CHECK-NEXT:    [[TMP69:%.*]] = icmp slt i32 [[SUB_2_4]], 0
 ; CHECK-NEXT:    [[NEG_2_4:%.*]] = sub nsw i32 0, [[SUB_2_4]]
-; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[NEG_2_4]], i32 [[SUB_2_4]]
-; CHECK-NEXT:    [[CMP12_2_4:%.*]] = icmp slt i32 [[TMP78]], [[SPEC_SELECT8_1_4]]
-; CHECK-NEXT:    [[TMP79:%.*]] = or i1 [[CMP12_2_4]], [[TMP76]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_4:%.*]] = select i1 [[CMP12_2_4]], i32 [[TMP78]], i32 [[SPEC_SELECT8_1_4]]
-; CHECK-NEXT:    [[SUB_3_4:%.*]] = sub i32 [[TMP30]], [[TMP16]]
-; CHECK-NEXT:    [[TMP80:%.*]] = icmp slt i32 [[SUB_3_4]], 0
+; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP69]], i32 [[NEG_2_4]], i32 [[SUB_2_4]]
+; CHECK-NEXT:    [[CMP12_2_4:%.*]] = icmp slt i32 [[TMP70]], [[SPEC_SELECT8_1_4]]
+; CHECK-NEXT:    [[TMP71:%.*]] = or i1 [[CMP12_2_4]], [[TMP68]]
+; CHECK-NEXT:    [[SPEC_SELECT8_2_4:%.*]] = select i1 [[CMP12_2_4]], i32 [[TMP70]], i32 [[SPEC_SELECT8_1_4]]
+; CHECK-NEXT:    [[SUB_3_4:%.*]] = sub i32 [[TMP29]], [[TMP15]]
+; CHECK-NEXT:    [[TMP72:%.*]] = icmp slt i32 [[SUB_3_4]], 0
 ; CHECK-NEXT:    [[NEG_3_4:%.*]] = sub nsw i32 0, [[SUB_3_4]]
-; CHECK-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[NEG_3_4]], i32 [[SUB_3_4]]
-; CHECK-NEXT:    [[CMP12_3_4:%.*]] = icmp slt i32 [[TMP81]], [[SPEC_SELECT8_2_4]]
-; CHECK-NEXT:    [[TMP82:%.*]] = or i1 [[CMP12_3_4]], [[TMP79]]
-; CHECK-NEXT:    [[SPEC_SELECT_3_4:%.*]] = select i1 [[TMP82]], i32 4, i32 [[SPEC_SELECT_3_3]]
-; CHECK-NEXT:    [[SPEC_SELECT8_3_4:%.*]] = select i1 [[CMP12_3_4]], i32 [[TMP81]], i32 [[SPEC_SELECT8_2_4]]
-; CHECK-NEXT:    [[SUB_5:%.*]] = sub i32 [[TMP30]], [[TMP17]]
-; CHECK-NEXT:    [[TMP83:%.*]] = icmp slt i32 [[SUB_5]], 0
+; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP72]], i32 [[NEG_3_4]], i32 [[SUB_3_4]]
+; CHECK-NEXT:    [[CMP12_3_4:%.*]] = icmp slt i32 [[TMP73]], [[SPEC_SELECT8_2_4]]
+; CHECK-NEXT:    [[TMP74:%.*]] = or i1 [[CMP12_3_4]], [[TMP71]]
+; CHECK-NEXT:    [[SPEC_SELECT_3_4:%.*]] = select i1 [[TMP74]], i32 4, i32 [[SPEC_SELECT_3_3]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3_4:%.*]] = select i1 [[CMP12_3_4]], i32 [[TMP73]], i32 [[SPEC_SELECT8_2_4]]
+; CHECK-NEXT:    [[SUB_5:%.*]] = sub i32 [[TMP29]], [[TMP16]]
+; CHECK-NEXT:    [[TMP75:%.*]] = icmp slt i32 [[SUB_5]], 0
 ; CHECK-NEXT:    [[NEG_5:%.*]] = sub nsw i32 0, [[SUB_5]]
-; CHECK-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[NEG_5]], i32 [[SUB_5]]
-; CHECK-NEXT:    [[CMP12_5:%.*]] = icmp slt i32 [[TMP84]], [[SPEC_SELECT8_3_4]]
-; CHECK-NEXT:    [[SPEC_SELECT8_5:%.*]] = select i1 [[CMP12_5]], i32 [[TMP84]], i32 [[SPEC_SELECT8_3_4]]
-; CHECK-NEXT:    [[SUB_1_5:%.*]] = sub i32 [[TMP30]], [[TMP18]]
-; CHECK-NEXT:    [[TMP85:%.*]] = icmp slt i32 [[SUB_1_5]], 0
+; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 [[NEG_5]], i32 [[SUB_5]]
+; CHECK-NEXT:    [[CMP12_5:%.*]] = icmp slt i32 [[TMP76]], [[SPEC_SELECT8_3_4]]
+; CHECK-NEXT:    [[SPEC_SELECT8_5:%.*]] = select i1 [[CMP12_5]], i32 [[TMP76]], i32 [[SPEC_SELECT8_3_4]]
+; CHECK-NEXT:    [[SUB_1_5:%.*]] = sub i32 [[TMP29]], [[TMP17]]
+; CHECK-NEXT:    [[TMP77:%.*]] = icmp slt i32 [[SUB_1_5]], 0
 ; CHECK-NEXT:    [[NEG_1_5:%.*]] = sub nsw i32 0, [[SUB_1_5]]
-; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[NEG_1_5]], i32 [[SUB_1_5]]
-; CHECK-NEXT:    [[CMP12_1_5:%.*]] = icmp slt i32 [[TMP86]], [[SPEC_SELECT8_5]]
-; CHECK-NEXT:    [[TMP87:%.*]] = or i1 [[CMP12_1_5]], [[CMP12_5]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_5:%.*]] = select i1 [[CMP12_1_5]], i32 [[TMP86]], i32 [[SPEC_SELECT8_5]]
-; CHECK-NEXT:    [[SUB_2_5:%.*]] = sub i32 [[TMP30]], [[TMP19]]
-; CHECK-NEXT:    [[TMP88:%.*]] = icmp slt i32 [[SUB_2_5]], 0
+; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[NEG_1_5]], i32 [[SUB_1_5]]
+; CHECK-NEXT:    [[CMP12_1_5:%.*]] = icmp slt i32 [[TMP78]], [[SPEC_SELECT8_5]]
+; CHECK-NEXT:    [[TMP79:%.*]] = or i1 [[CMP12_1_5]], [[CMP12_5]]
+; CHECK-NEXT:    [[SPEC_SELECT8_1_5:%.*]] = select i1 [[CMP12_1_5]], i32 [[TMP78]], i32 [[SPEC_SELECT8_5]]
+; CHECK-NEXT:    [[SUB_2_5:%.*]] = sub i32 [[TMP29]], [[TMP18]]
+; CHECK-NEXT:    [[TMP80:%.*]] = icmp slt i32 [[SUB_2_5]], 0
 ; CHECK-NEXT:    [[NEG_2_5:%.*]] = sub nsw i32 0, [[SUB_2_5]]
-; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[NEG_2_5]], i32 [[SUB_2_5]]
-; CHECK-NEXT:    [[CMP12_2_5:%.*]] = icmp slt i32 [[TMP89]], [[SPEC_SELECT8_1_5]]
-; CHECK-NEXT:    [[TMP90:%.*]] = or i1 [[CMP12_2_5]], [[TMP87]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_5:%.*]] = select i1 [[CMP12_2_5]], i32 [[TMP89]], i32 [[SPEC_SELECT8_1_5]]
-; CHECK-NEXT:    [[SUB_3_5:%.*]] = sub i32 [[TMP30]], [[TMP20]]
-; CHECK-NEXT:    [[TMP91:%.*]] = icmp slt i32 [[SUB_3_5]], 0
+; CHECK-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[NEG_2_5]], i32 [[SUB_2_5]]
+; CHECK-NEXT:    [[CMP12_2_5:%.*]] = icmp slt i32 [[TMP81]], [[SPEC_SELECT8_1_5]]
+; CHECK-NEXT:    [[TMP82:%.*]] = or i1 [[CMP12_2_5]], [[TMP79]]
+; CHECK-NEXT:    [[SPEC_SELECT8_2_5:%.*]] = select i1 [[CMP12_2_5]], i32 [[TMP81]], i32 [[SPEC_SELECT8_1_5]]
+; CHECK-NEXT:    [[SUB_3_5:%.*]] = sub i32 [[TMP29]], [[TMP19]]
+; CHECK-NEXT:    [[TMP83:%.*]] = icmp slt i32 [[SUB_3_5]], 0
 ; CHECK-NEXT:    [[NEG_3_5:%.*]] = sub nsw i32 0, [[SUB_3_5]]
-; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[NEG_3_5]], i32 [[SUB_3_5]]
-; CHECK-NEXT:    [[CMP12_3_5:%.*]] = icmp slt i32 [[TMP92]], [[SPEC_SELECT8_2_5]]
-; CHECK-NEXT:    [[TMP93:%.*]] = or i1 [[CMP12_3_5]], [[TMP90]]
-; CHECK-NEXT:    [[SPEC_SELECT_3_5:%.*]] = select i1 [[TMP93]], i32 5, i32 [[SPEC_SELECT_3_4]]
-; CHECK-NEXT:    [[SPEC_SELECT8_3_5:%.*]] = select i1 [[CMP12_3_5]], i32 [[TMP92]], i32 [[SPEC_SELECT8_2_5]]
-; CHECK-NEXT:    [[SUB_6:%.*]] = sub i32 [[TMP30]], [[TMP21]]
-; CHECK-NEXT:    [[TMP94:%.*]] = icmp slt i32 [[SUB_6]], 0
+; CHECK-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[NEG_3_5]], i32 [[SUB_3_5]]
+; CHECK-NEXT:    [[CMP12_3_5:%.*]] = icmp slt i32 [[TMP84]], [[SPEC_SELECT8_2_5]]
+; CHECK-NEXT:    [[TMP85:%.*]] = or i1 [[CMP12_3_5]], [[TMP82]]
+; CHECK-NEXT:    [[SPEC_SELECT_3_5:%.*]] = select i1 [[TMP85]], i32 5, i32 [[SPEC_SELECT_3_4]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3_5:%.*]] = select i1 [[CMP12_3_5]], i32 [[TMP84]], i32 [[SPEC_SELECT8_2_5]]
+; CHECK-NEXT:    [[SUB_6:%.*]] = sub i32 [[TMP29]], [[TMP20]]
+; CHECK-NEXT:    [[TMP86:%.*]] = icmp slt i32 [[SUB_6]], 0
 ; CHECK-NEXT:    [[NEG_6:%.*]] = sub nsw i32 0, [[SUB_6]]
-; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[NEG_6]], i32 [[SUB_6]]
-; CHECK-NEXT:    [[CMP12_6:%.*]] = icmp slt i32 [[TMP95]], [[SPEC_SELECT8_3_5]]
-; CHECK-NEXT:    [[SPEC_SELECT8_6:%.*]] = select i1 [[CMP12_6]], i32 [[TMP95]], i32 [[SPEC_SELECT8_3_5]]
-; CHECK-NEXT:    [[SUB_1_6:%.*]] = sub i32 [[TMP30]], [[TMP22]]
-; CHECK-NEXT:    [[TMP96:%.*]] = icmp slt i32 [[SUB_1_6]], 0
+; CHECK-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[NEG_6]], i32 [[SUB_6]]
+; CHECK-NEXT:    [[CMP12_6:%.*]] = icmp slt i32 [[TMP87]], [[SPEC_SELECT8_3_5]]
+; CHECK-NEXT:    [[SPEC_SELECT8_6:%.*]] = select i1 [[CMP12_6]], i32 [[TMP87]], i32 [[SPEC_SELECT8_3_5]]
+; CHECK-NEXT:    [[SUB_1_6:%.*]] = sub i32 [[TMP29]], [[TMP21]]
+; CHECK-NEXT:    [[TMP88:%.*]] = icmp slt i32 [[SUB_1_6]], 0
 ; CHECK-NEXT:    [[NEG_1_6:%.*]] = sub nsw i32 0, [[SUB_1_6]]
-; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[NEG_1_6]], i32 [[SUB_1_6]]
-; CHECK-NEXT:    [[CMP12_1_6:%.*]] = icmp slt i32 [[TMP97]], [[SPEC_SELECT8_6]]
-; CHECK-NEXT:    [[TMP98:%.*]] = or i1 [[CMP12_1_6]], [[CMP12_6]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_6:%.*]] = select i1 [[CMP12_1_6]], i32 [[TMP97]], i32 [[SPEC_SELECT8_6]]
-; CHECK-NEXT:    [[SUB_2_6:%.*]] = sub i32 [[TMP30]], [[TMP23]]
-; CHECK-NEXT:    [[TMP99:%.*]] = icmp slt i32 [[SUB_2_6]], 0
+; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[NEG_1_6]], i32 [[SUB_1_6]]
+; CHECK-NEXT:    [[CMP12_1_6:%.*]] = icmp slt i32 [[TMP89]], [[SPEC_SELECT8_6]]
+; CHECK-NEXT:    [[TMP90:%.*]] = or i1 [[CMP12_1_6]], [[CMP12_6]]
+; CHECK-NEXT:    [[SPEC_SELECT8_1_6:%.*]] = select i1 [[CMP12_1_6]], i32 [[TMP89]], i32 [[SPEC_SELECT8_6]]
+; CHECK-NEXT:    [[SUB_2_6:%.*]] = sub i32 [[TMP29]], [[TMP22]]
+; CHECK-NEXT:    [[TMP91:%.*]] = icmp slt i32 [[SUB_2_6]], 0
 ; CHECK-NEXT:    [[NEG_2_6:%.*]] = sub nsw i32 0, [[SUB_2_6]]
-; CHECK-NEXT:    [[TMP100:%.*]] = select i1 [[TMP99]], i32 [[NEG_2_6]], i32 [[SUB_2_6]]
-; CHECK-NEXT:    [[CMP12_2_6:%.*]] = icmp slt i32 [[TMP100]], [[SPEC_SELECT8_1_6]]
-; CHECK-NEXT:    [[TMP101:%.*]] = or i1 [[CMP12_2_6]], [[TMP98]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_6:%.*]] = select i1 [[CMP12_2_6]], i32 [[TMP100]], i32 [[SPEC_SELECT8_1_6]]
-; CHECK-NEXT:    [[SUB_3_6:%.*]] = sub i32 [[TMP30]], [[TMP24]]
-; CHECK-NEXT:    [[TMP102:%.*]] = icmp slt i32 [[SUB_3_6]], 0
+; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[NEG_2_6]], i32 [[SUB_2_6]]
+; CHECK-NEXT:    [[CMP12_2_6:%.*]] = icmp slt i32 [[TMP92]], [[SPEC_SELECT8_1_6]]
+; CHECK-NEXT:    [[TMP93:%.*]] = or i1 [[CMP12_2_6]], [[TMP90]]
+; CHECK-NEXT:    [[SPEC_SELECT8_2_6:%.*]] = select i1 [[CMP12_2_6]], i32 [[TMP92]], i32 [[SPEC_SELECT8_1_6]]
+; CHECK-NEXT:    [[SUB_3_6:%.*]] = sub i32 [[TMP29]], [[TMP23]]
+; CHECK-NEXT:    [[TMP94:%.*]] = icmp slt i32 [[SUB_3_6]], 0
 ; CHECK-NEXT:    [[NEG_3_6:%.*]] = sub nsw i32 0, [[SUB_3_6]]
-; CHECK-NEXT:    [[TMP103:%.*]] = select i1 [[TMP102]], i32 [[NEG_3_6]], i32 [[SUB_3_6]]
-; CHECK-NEXT:    [[CMP12_3_6:%.*]] = icmp slt i32 [[TMP103]], [[SPEC_SELECT8_2_6]]
-; CHECK-NEXT:    [[TMP104:%.*]] = or i1 [[CMP12_3_6]], [[TMP101]]
-; CHECK-NEXT:    [[SPEC_SELECT_3_6:%.*]] = select i1 [[TMP104]], i32 6, i32 [[SPEC_SELECT_3_5]]
-; CHECK-NEXT:    [[SPEC_SELECT8_3_6:%.*]] = select i1 [[CMP12_3_6]], i32 [[TMP103]], i32 [[SPEC_SELECT8_2_6]]
-; CHECK-NEXT:    [[SUB_7:%.*]] = sub i32 [[TMP30]], [[TMP25]]
-; CHECK-NEXT:    [[TMP105:%.*]] = icmp slt i32 [[SUB_7]], 0
+; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[NEG_3_6]], i32 [[SUB_3_6]]
+; CHECK-NEXT:    [[CMP12_3_6:%.*]] = icmp slt i32 [[TMP95]], [[SPEC_SELECT8_2_6]]
+; CHECK-NEXT:    [[TMP96:%.*]] = or i1 [[CMP12_3_6]], [[TMP93]]
+; CHECK-NEXT:    [[SPEC_SELECT_3_6:%.*]] = select i1 [[TMP96]], i32 6, i32 [[SPEC_SELECT_3_5]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3_6:%.*]] = select i1 [[CMP12_3_6]], i32 [[TMP95]], i32 [[SPEC_SELECT8_2_6]]
+; CHECK-NEXT:    [[SUB_7:%.*]] = sub i32 [[TMP29]], [[TMP24]]
+; CHECK-NEXT:    [[TMP97:%.*]] = icmp slt i32 [[SUB_7]], 0
 ; CHECK-NEXT:    [[NEG_7:%.*]] = sub nsw i32 0, [[SUB_7]]
-; CHECK-NEXT:    [[TMP106:%.*]] = select i1 [[TMP105]], i32 [[NEG_7]], i32 [[SUB_7]]
-; CHECK-NEXT:    [[CMP12_7:%.*]] = icmp slt i32 [[TMP106]], [[SPEC_SELECT8_3_6]]
-; CHECK-NEXT:    [[SPEC_SELECT8_7:%.*]] = select i1 [[CMP12_7]], i32 [[TMP106]], i32 [[SPEC_SELECT8_3_6]]
-; CHECK-NEXT:    [[SUB_1_7:%.*]] = sub i32 [[TMP30]], [[TMP26]]
-; CHECK-NEXT:    [[TMP107:%.*]] = icmp slt i32 [[SUB_1_7]], 0
+; CHECK-NEXT:    [[TMP98:%.*]] = select i1 [[TMP97]], i32 [[NEG_7]], i32 [[SUB_7]]
+; CHECK-NEXT:    [[CMP12_7:%.*]] = icmp slt i32 [[TMP98]], [[SPEC_SELECT8_3_6]]
+; CHECK-NEXT:    [[SPEC_SELECT8_7:%.*]] = select i1 [[CMP12_7]], i32 [[TMP98]], i32 [[SPEC_SELECT8_3_6]]
+; CHECK-NEXT:    [[SUB_1_7:%.*]] = sub i32 [[TMP29]], [[TMP25]]
+; CHECK-NEXT:    [[TMP99:%.*]] = icmp slt i32 [[SUB_1_7]], 0
 ; CHECK-NEXT:    [[NEG_1_7:%.*]] = sub nsw i32 0, [[SUB_1_7]]
-; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP107]], i32 [[NEG_1_7]], i32 [[SUB_1_7]]
-; CHECK-NEXT:    [[CMP12_1_7:%.*]] = icmp slt i32 [[TMP108]], [[SPEC_SELECT8_7]]
-; CHECK-NEXT:    [[TMP109:%.*]] = or i1 [[CMP12_1_7]], [[CMP12_7]]
-; CHECK-NEXT:    [[SPEC_SELECT8_1_7:%.*]] = select i1 [[CMP12_1_7]], i32 [[TMP108]], i32 [[SPEC_SELECT8_7]]
-; CHECK-NEXT:    [[SUB_2_7:%.*]] = sub i32 [[TMP30]], [[TMP27]]
-; CHECK-NEXT:    [[TMP110:%.*]] = icmp slt i32 [[SUB_2_7]], 0
+; CHECK-NEXT:    [[TMP100:%.*]] = select i1 [[TMP99]], i32 [[NEG_1_7]], i32 [[SUB_1_7]]
+; CHECK-NEXT:    [[CMP12_1_7:%.*]] = icmp slt i32 [[TMP100]], [[SPEC_SELECT8_7]]
+; CHECK-NEXT:    [[TMP101:%.*]] = or i1 [[CMP12_1_7]], [[CMP12_7]]
+; CHECK-NEXT:    [[SPEC_SELECT8_1_7:%.*]] = select i1 [[CMP12_1_7]], i32 [[TMP100]], i32 [[SPEC_SELECT8_7]]
+; CHECK-NEXT:    [[SUB_2_7:%.*]] = sub i32 [[TMP29]], [[TMP26]]
+; CHECK-NEXT:    [[TMP102:%.*]] = icmp slt i32 [[SUB_2_7]], 0
 ; CHECK-NEXT:    [[NEG_2_7:%.*]] = sub nsw i32 0, [[SUB_2_7]]
-; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP110]], i32 [[NEG_2_7]], i32 [[SUB_2_7]]
-; CHECK-NEXT:    [[CMP12_2_7:%.*]] = icmp slt i32 [[TMP111]], [[SPEC_SELECT8_1_7]]
-; CHECK-NEXT:    [[TMP112:%.*]] = or i1 [[CMP12_2_7]], [[TMP109]]
-; CHECK-NEXT:    [[SPEC_SELECT8_2_7:%.*]] = select i1 [[CMP12_2_7]], i32 [[TMP111]], i32 [[SPEC_SELECT8_1_7]]
-; CHECK-NEXT:    [[SUB_3_7:%.*]] = sub i32 [[TMP30]], [[TMP28]]
-; CHECK-NEXT:    [[TMP113:%.*]] = icmp slt i32 [[SUB_3_7]], 0
+; CHECK-NEXT:    [[TMP103:%.*]] = select i1 [[TMP102]], i32 [[NEG_2_7]], i32 [[SUB_2_7]]
+; CHECK-NEXT:    [[CMP12_2_7:%.*]] = icmp slt i32 [[TMP103]], [[SPEC_SELECT8_1_7]]
+; CHECK-NEXT:    [[TMP104:%.*]] = or i1 [[CMP12_2_7]], [[TMP101]]
+; CHECK-NEXT:    [[SPEC_SELECT8_2_7:%.*]] = select i1 [[CMP12_2_7]], i32 [[TMP103]], i32 [[SPEC_SELECT8_1_7]]
+; CHECK-NEXT:    [[SUB_3_7:%.*]] = sub i32 [[TMP29]], [[TMP27]]
+; CHECK-NEXT:    [[TMP105:%.*]] = icmp slt i32 [[SUB_3_7]], 0
 ; CHECK-NEXT:    [[NEG_3_7:%.*]] = sub nsw i32 0, [[SUB_3_7]]
-; CHECK-NEXT:    [[TMP114:%.*]] = select i1 [[TMP113]], i32 [[NEG_3_7]], i32 [[SUB_3_7]]
-; CHECK-NEXT:    [[CMP12_3_7:%.*]] = icmp slt i32 [[TMP114]], [[SPEC_SELECT8_2_7]]
-; CHECK-NEXT:    [[TMP115:%.*]] = or i1 [[CMP12_3_7]], [[TMP112]]
-; CHECK-NEXT:    [[SPEC_SELECT_3_7:%.*]] = select i1 [[TMP115]], i32 7, i32 [[SPEC_SELECT_3_6]]
-; CHECK-NEXT:    [[SPEC_SELECT8_3_7]] = select i1 [[CMP12_3_7]], i32 [[TMP114]], i32 [[SPEC_SELECT8_2_7]]
+; CHECK-NEXT:    [[TMP106:%.*]] = select i1 [[TMP105]], i32 [[NEG_3_7]], i32 [[SUB_3_7]]
+; CHECK-NEXT:    [[CMP12_3_7:%.*]] = icmp slt i32 [[TMP106]], [[SPEC_SELECT8_2_7]]
+; CHECK-NEXT:    [[TMP107:%.*]] = or i1 [[CMP12_3_7]], [[TMP104]]
+; CHECK-NEXT:    [[SPEC_SELECT_3_7:%.*]] = select i1 [[TMP107]], i32 7, i32 [[SPEC_SELECT_3_6]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3_7]] = select i1 [[CMP12_3_7]], i32 [[TMP106]], i32 [[SPEC_SELECT8_2_7]]
 ; CHECK-NEXT:    [[K:%.*]] = getelementptr inbounds [366 x i32], [366 x i32]* @l, i64 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i32 [[SPEC_SELECT_3_7]], i32* [[K]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
@@ -11,31 +11,31 @@
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  bb279:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float undef, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float poison, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float poison, i32 1
 ; CHECK-NEXT:    br label [[BB283:%.*]]
 ; CHECK:       bb283:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ poison, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ poison, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
 ; CHECK-NEXT:    br label [[BB284:%.*]]
 ; CHECK:       bb284:
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], poison
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], poison
 ; CHECK-NEXT:    br label [[BB21_I:%.*]]
 ; CHECK:       bb21.i:
 ; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
 ; CHECK:       bb22.i:
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> poison, [[TMP6]]
 ; CHECK-NEXT:    br label [[BB32_I:%.*]]
 ; CHECK:       bb32.i:
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double poison, double 0.000000e+00>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> poison, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], poison
 ; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -11,31 +11,31 @@
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  bb279:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float undef, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float poison, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float poison, i32 1
 ; CHECK-NEXT:    br label [[BB283:%.*]]
 ; CHECK:       bb283:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ poison, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ poison, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
 ; CHECK-NEXT:    br label [[BB284:%.*]]
 ; CHECK:       bb284:
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], poison
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], poison
 ; CHECK-NEXT:    br label [[BB21_I:%.*]]
 ; CHECK:       bb21.i:
 ; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
 ; CHECK:       bb22.i:
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> poison, [[TMP6]]
 ; CHECK-NEXT:    br label [[BB32_I:%.*]]
 ; CHECK:       bb32.i:
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double poison, double 0.000000e+00>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> poison, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], poison
 ; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -27,12 +27,9 @@
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
 ; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
-; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
-; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -41,16 +38,31 @@
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT:    [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[T28]], i32 0
-; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
-; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
-; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T29]], i32 3
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 6270, i32 poison, i32 -15137, i32 poison, i32 poison, i32 poison>, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[SHUFFLE]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <8 x i32> [[SHUFFLE]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 0
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 1
+; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 2
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 3
+; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[TMP14]], i32 3
+; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP11]], i32 4
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP12]], i32 5
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 6
+; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[TMP15]], i32 6
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -27,12 +27,9 @@
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
 ; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
-; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
-; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -41,16 +38,31 @@
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT:    [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0
-; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
-; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
-; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T29]], i32 3
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 6270, i32 poison, i32 -15137, i32 poison, i32 poison, i32 poison>, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[SHUFFLE]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <8 x i32> [[SHUFFLE]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 0
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 1
+; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 2
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 3
+; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[TMP14]], i32 3
+; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP11]], i32 4
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP12]], i32 5
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[SHUFFLE1]], i32 6
+; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[TMP15]], i32 6
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -7,15 +7,15 @@
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A7:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
@@ -57,15 +57,15 @@
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A6:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 1, i32 0, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
@@ -111,15 +111,15 @@
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A4:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 3, i32 2, i32 3, i32 0, i32 1, i32 0, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -132,82 +132,58 @@
 ; MAX256-NEXT:  bb:
 ; MAX256-NEXT:    br label [[BB1:%.*]]
 ; MAX256:       bb1:
-; MAX256-NEXT:    [[TMP0:%.*]] = insertelement <4 x half> poison, half [[HVAL:%.*]], i32 0
-; MAX256-NEXT:    [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
-; MAX256-NEXT:    [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2
-; MAX256-NEXT:    [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3
-; MAX256-NEXT:    [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float>
-; MAX256-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; MAX256-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
-; MAX256-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[FVAL]], i32 3
-; MAX256-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[FVAL]], i32 4
-; MAX256-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP9]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[FVAL]], i32 7
-; MAX256-NEXT:    [[TMP13:%.*]] = fmul <8 x float> [[SHUFFLE]], [[TMP12]]
-; MAX256-NEXT:    [[TMP14:%.*]] = fadd <8 x float> zeroinitializer, [[TMP13]]
-; MAX256-NEXT:    [[TMP15:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 3
-; MAX256-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 2
-; MAX256-NEXT:    [[TMP17:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 1
-; MAX256-NEXT:    [[TMP18:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 0
-; MAX256-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i32 0
-; MAX256-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP19]], float [[TMP16]], i32 1
-; MAX256-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[TMP17]], i32 2
-; MAX256-NEXT:    [[TMP22:%.*]] = insertelement <8 x float> [[TMP21]], float [[TMP18]], i32 3
-; MAX256-NEXT:    [[TMP23:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP15]], i32 4
-; MAX256-NEXT:    [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP16]], i32 5
-; MAX256-NEXT:    [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP17]], i32 6
-; MAX256-NEXT:    [[TMP26:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP18]], i32 7
-; MAX256-NEXT:    [[TMP27:%.*]] = fmul <8 x float> [[TMP26]], [[TMP12]]
-; MAX256-NEXT:    [[TMP28:%.*]] = fadd <8 x float> zeroinitializer, [[TMP27]]
-; MAX256-NEXT:    [[TMP29:%.*]] = fmul <8 x float> [[TMP26]], [[TMP12]]
-; MAX256-NEXT:    [[TMP30:%.*]] = fadd <8 x float> zeroinitializer, [[TMP29]]
-; MAX256-NEXT:    [[TMP31:%.*]] = fmul <8 x float> [[TMP26]], [[TMP12]]
-; MAX256-NEXT:    [[TMP32:%.*]] = fadd <8 x float> zeroinitializer, [[TMP31]]
-; MAX256-NEXT:    [[TMP33:%.*]] = extractelement <8 x float> [[TMP14]], i32 0
-; MAX256-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP33]], i32 0
-; MAX256-NEXT:    [[TMP35:%.*]] = extractelement <8 x float> [[TMP14]], i32 1
-; MAX256-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP35]], i32 1
-; MAX256-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[FVAL]], i32 3
-; MAX256-NEXT:    [[TMP39:%.*]] = extractelement <8 x float> [[TMP14]], i32 4
-; MAX256-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP39]], i32 4
-; MAX256-NEXT:    [[TMP41:%.*]] = extractelement <8 x float> [[TMP14]], i32 5
-; MAX256-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP41]], i32 5
-; MAX256-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[FVAL]], i32 7
-; MAX256-NEXT:    [[TMP45:%.*]] = extractelement <8 x float> [[TMP28]], i32 2
-; MAX256-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP45]], i32 2
-; MAX256-NEXT:    [[TMP47:%.*]] = extractelement <8 x float> [[TMP28]], i32 3
-; MAX256-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP47]], i32 3
-; MAX256-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[FVAL]], i32 4
-; MAX256-NEXT:    [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP51:%.*]] = extractelement <8 x float> [[TMP28]], i32 6
-; MAX256-NEXT:    [[TMP52:%.*]] = insertelement <8 x float> [[TMP50]], float [[TMP51]], i32 6
-; MAX256-NEXT:    [[TMP53:%.*]] = extractelement <8 x float> [[TMP28]], i32 7
-; MAX256-NEXT:    [[TMP54:%.*]] = insertelement <8 x float> [[TMP52]], float [[TMP53]], i32 7
-; MAX256-NEXT:    [[TMP55:%.*]] = extractelement <8 x float> [[TMP30]], i32 2
-; MAX256-NEXT:    [[TMP56:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP55]], i32 2
-; MAX256-NEXT:    [[TMP57:%.*]] = extractelement <8 x float> [[TMP30]], i32 3
-; MAX256-NEXT:    [[TMP58:%.*]] = insertelement <8 x float> [[TMP56]], float [[TMP57]], i32 3
-; MAX256-NEXT:    [[TMP59:%.*]] = insertelement <8 x float> [[TMP58]], float [[FVAL]], i32 4
-; MAX256-NEXT:    [[TMP60:%.*]] = insertelement <8 x float> [[TMP59]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP61:%.*]] = extractelement <8 x float> [[TMP30]], i32 6
-; MAX256-NEXT:    [[TMP62:%.*]] = insertelement <8 x float> [[TMP60]], float [[TMP61]], i32 6
-; MAX256-NEXT:    [[TMP63:%.*]] = extractelement <8 x float> [[TMP30]], i32 7
-; MAX256-NEXT:    [[TMP64:%.*]] = insertelement <8 x float> [[TMP62]], float [[TMP63]], i32 7
-; MAX256-NEXT:    [[TMP65:%.*]] = extractelement <8 x float> [[TMP32]], i32 2
-; MAX256-NEXT:    [[TMP66:%.*]] = insertelement <8 x float> [[TMP6]], float [[TMP65]], i32 2
-; MAX256-NEXT:    [[TMP67:%.*]] = extractelement <8 x float> [[TMP32]], i32 3
-; MAX256-NEXT:    [[TMP68:%.*]] = insertelement <8 x float> [[TMP66]], float [[TMP67]], i32 3
-; MAX256-NEXT:    [[TMP69:%.*]] = insertelement <8 x float> [[TMP68]], float [[FVAL]], i32 4
-; MAX256-NEXT:    [[TMP70:%.*]] = insertelement <8 x float> [[TMP69]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP71:%.*]] = extractelement <8 x float> [[TMP32]], i32 6
-; MAX256-NEXT:    [[TMP72:%.*]] = insertelement <8 x float> [[TMP70]], float [[TMP71]], i32 6
-; MAX256-NEXT:    [[TMP73:%.*]] = extractelement <8 x float> [[TMP32]], i32 7
-; MAX256-NEXT:    [[TMP74:%.*]] = insertelement <8 x float> [[TMP72]], float [[TMP73]], i32 7
+; MAX256-NEXT:    [[I:%.*]] = fpext half [[HVAL:%.*]] to float
+; MAX256-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
+; MAX256-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> [[TMP0]], float [[I]], i32 1
+; MAX256-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I]], i32 2
+; MAX256-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I]], i32 3
+; MAX256-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I]], i32 4
+; MAX256-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I]], i32 5
+; MAX256-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I]], i32 6
+; MAX256-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I]], i32 7
+; MAX256-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
+; MAX256-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[FVAL]], i32 1
+; MAX256-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP9]], float [[FVAL]], i32 2
+; MAX256-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[FVAL]], i32 3
+; MAX256-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[FVAL]], i32 4
+; MAX256-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[FVAL]], i32 5
+; MAX256-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[FVAL]], i32 6
+; MAX256-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[FVAL]], i32 7
+; MAX256-NEXT:    [[TMP16:%.*]] = fmul <8 x float> [[TMP7]], [[TMP15]]
+; MAX256-NEXT:    [[TMP17:%.*]] = fadd <8 x float> zeroinitializer, [[TMP16]]
+; MAX256-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX256-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I3]], i32 1
+; MAX256-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP19]], float [[I3]], i32 2
+; MAX256-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I3]], i32 3
+; MAX256-NEXT:    [[TMP22:%.*]] = insertelement <8 x float> [[TMP21]], float [[I3]], i32 4
+; MAX256-NEXT:    [[TMP23:%.*]] = insertelement <8 x float> [[TMP22]], float [[I3]], i32 5
+; MAX256-NEXT:    [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[I3]], i32 6
+; MAX256-NEXT:    [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[I3]], i32 7
+; MAX256-NEXT:    [[TMP26:%.*]] = fmul <8 x float> [[TMP25]], [[TMP15]]
+; MAX256-NEXT:    [[TMP27:%.*]] = fadd <8 x float> zeroinitializer, [[TMP26]]
+; MAX256-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX256-NEXT:    [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[I6]], i32 1
+; MAX256-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP29]], float [[I6]], i32 2
+; MAX256-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[I6]], i32 3
+; MAX256-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[I6]], i32 4
+; MAX256-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[I6]], i32 5
+; MAX256-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[I6]], i32 6
+; MAX256-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[I6]], i32 7
+; MAX256-NEXT:    [[TMP36:%.*]] = fmul <8 x float> [[TMP35]], [[TMP15]]
+; MAX256-NEXT:    [[TMP37:%.*]] = fadd <8 x float> zeroinitializer, [[TMP36]]
+; MAX256-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX256-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[I9]], i32 1
+; MAX256-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[I9]], i32 2
+; MAX256-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[I9]], i32 3
+; MAX256-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[I9]], i32 4
+; MAX256-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[I9]], i32 5
+; MAX256-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[I9]], i32 6
+; MAX256-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[I9]], i32 7
+; MAX256-NEXT:    [[TMP46:%.*]] = fmul <8 x float> [[TMP45]], [[TMP15]]
+; MAX256-NEXT:    [[TMP47:%.*]] = fadd <8 x float> zeroinitializer, [[TMP46]]
 ; MAX256-NEXT:    switch i32 undef, label [[BB5:%.*]] [
 ; MAX256-NEXT:    i32 0, label [[BB2:%.*]]
 ; MAX256-NEXT:    i32 1, label [[BB3:%.*]]
@@ -216,188 +192,74 @@
 ; MAX256:       bb3:
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb4:
-; MAX256-NEXT:    [[TMP75:%.*]] = insertelement <8 x float> [[TMP34]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP76:%.*]] = insertelement <8 x float> [[TMP75]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP77:%.*]] = extractelement <8 x float> [[TMP14]], i32 3
-; MAX256-NEXT:    [[TMP78:%.*]] = insertelement <8 x float> [[TMP76]], float [[TMP77]], i32 3
-; MAX256-NEXT:    [[TMP79:%.*]] = insertelement <8 x float> [[TMP78]], float [[TMP39]], i32 4
-; MAX256-NEXT:    [[TMP80:%.*]] = insertelement <8 x float> [[TMP79]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP81:%.*]] = insertelement <8 x float> [[TMP80]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP82:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
-; MAX256-NEXT:    [[TMP83:%.*]] = insertelement <8 x float> [[TMP81]], float [[TMP82]], i32 7
-; MAX256-NEXT:    [[TMP84:%.*]] = extractelement <8 x float> [[TMP28]], i32 0
-; MAX256-NEXT:    [[TMP85:%.*]] = insertelement <8 x float> poison, float [[TMP84]], i32 0
-; MAX256-NEXT:    [[TMP86:%.*]] = insertelement <8 x float> [[TMP85]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP87:%.*]] = insertelement <8 x float> [[TMP86]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP88:%.*]] = insertelement <8 x float> [[TMP87]], float [[TMP47]], i32 3
-; MAX256-NEXT:    [[TMP89:%.*]] = extractelement <8 x float> [[TMP28]], i32 4
-; MAX256-NEXT:    [[TMP90:%.*]] = insertelement <8 x float> [[TMP88]], float [[TMP89]], i32 4
-; MAX256-NEXT:    [[TMP91:%.*]] = insertelement <8 x float> [[TMP90]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP92:%.*]] = insertelement <8 x float> [[TMP91]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP93:%.*]] = insertelement <8 x float> [[TMP92]], float [[TMP53]], i32 7
-; MAX256-NEXT:    [[TMP94:%.*]] = extractelement <8 x float> [[TMP30]], i32 0
-; MAX256-NEXT:    [[TMP95:%.*]] = insertelement <8 x float> poison, float [[TMP94]], i32 0
-; MAX256-NEXT:    [[TMP96:%.*]] = insertelement <8 x float> [[TMP95]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP97:%.*]] = insertelement <8 x float> [[TMP96]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP98:%.*]] = insertelement <8 x float> [[TMP97]], float [[TMP57]], i32 3
-; MAX256-NEXT:    [[TMP99:%.*]] = extractelement <8 x float> [[TMP30]], i32 4
-; MAX256-NEXT:    [[TMP100:%.*]] = insertelement <8 x float> [[TMP98]], float [[TMP99]], i32 4
-; MAX256-NEXT:    [[TMP101:%.*]] = insertelement <8 x float> [[TMP100]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP102:%.*]] = insertelement <8 x float> [[TMP101]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP103:%.*]] = insertelement <8 x float> [[TMP102]], float [[TMP63]], i32 7
-; MAX256-NEXT:    [[TMP104:%.*]] = extractelement <8 x float> [[TMP32]], i32 0
-; MAX256-NEXT:    [[TMP105:%.*]] = insertelement <8 x float> poison, float [[TMP104]], i32 0
-; MAX256-NEXT:    [[TMP106:%.*]] = insertelement <8 x float> [[TMP105]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP107:%.*]] = insertelement <8 x float> [[TMP106]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP108:%.*]] = insertelement <8 x float> [[TMP107]], float [[TMP67]], i32 3
-; MAX256-NEXT:    [[TMP109:%.*]] = extractelement <8 x float> [[TMP32]], i32 4
-; MAX256-NEXT:    [[TMP110:%.*]] = insertelement <8 x float> [[TMP108]], float [[TMP109]], i32 4
-; MAX256-NEXT:    [[TMP111:%.*]] = insertelement <8 x float> [[TMP110]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP112:%.*]] = insertelement <8 x float> [[TMP111]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP113:%.*]] = insertelement <8 x float> [[TMP112]], float [[TMP73]], i32 7
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb5:
-; MAX256-NEXT:    [[TMP114:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP35]], i32 1
-; MAX256-NEXT:    [[TMP115:%.*]] = insertelement <8 x float> [[TMP114]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP116:%.*]] = extractelement <8 x float> [[TMP14]], i32 3
-; MAX256-NEXT:    [[TMP117:%.*]] = insertelement <8 x float> [[TMP115]], float [[TMP116]], i32 3
-; MAX256-NEXT:    [[TMP118:%.*]] = insertelement <8 x float> [[TMP117]], float [[FVAL]], i32 4
-; MAX256-NEXT:    [[TMP119:%.*]] = insertelement <8 x float> [[TMP118]], float [[TMP41]], i32 5
-; MAX256-NEXT:    [[TMP120:%.*]] = insertelement <8 x float> [[TMP119]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP121:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
-; MAX256-NEXT:    [[TMP122:%.*]] = insertelement <8 x float> [[TMP120]], float [[TMP121]], i32 7
-; MAX256-NEXT:    [[TMP123:%.*]] = extractelement <8 x float> [[TMP28]], i32 0
-; MAX256-NEXT:    [[TMP124:%.*]] = insertelement <8 x float> poison, float [[TMP123]], i32 0
-; MAX256-NEXT:    [[TMP125:%.*]] = insertelement <8 x float> [[TMP124]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP126:%.*]] = insertelement <8 x float> [[TMP125]], float [[TMP45]], i32 2
-; MAX256-NEXT:    [[TMP127:%.*]] = insertelement <8 x float> [[TMP126]], float [[FVAL]], i32 3
-; MAX256-NEXT:    [[TMP128:%.*]] = extractelement <8 x float> [[TMP28]], i32 4
-; MAX256-NEXT:    [[TMP129:%.*]] = insertelement <8 x float> [[TMP127]], float [[TMP128]], i32 4
-; MAX256-NEXT:    [[TMP130:%.*]] = insertelement <8 x float> [[TMP129]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP131:%.*]] = insertelement <8 x float> [[TMP130]], float [[TMP51]], i32 6
-; MAX256-NEXT:    [[TMP132:%.*]] = insertelement <8 x float> [[TMP131]], float [[FVAL]], i32 7
-; MAX256-NEXT:    [[TMP133:%.*]] = extractelement <8 x float> [[TMP30]], i32 0
-; MAX256-NEXT:    [[TMP134:%.*]] = insertelement <8 x float> poison, float [[TMP133]], i32 0
-; MAX256-NEXT:    [[TMP135:%.*]] = insertelement <8 x float> [[TMP134]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP136:%.*]] = insertelement <8 x float> [[TMP135]], float [[TMP55]], i32 2
-; MAX256-NEXT:    [[TMP137:%.*]] = insertelement <8 x float> [[TMP136]], float [[FVAL]], i32 3
-; MAX256-NEXT:    [[TMP138:%.*]] = extractelement <8 x float> [[TMP30]], i32 4
-; MAX256-NEXT:    [[TMP139:%.*]] = insertelement <8 x float> [[TMP137]], float [[TMP138]], i32 4
-; MAX256-NEXT:    [[TMP140:%.*]] = insertelement <8 x float> [[TMP139]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP141:%.*]] = insertelement <8 x float> [[TMP140]], float [[TMP61]], i32 6
-; MAX256-NEXT:    [[TMP142:%.*]] = insertelement <8 x float> [[TMP141]], float [[FVAL]], i32 7
-; MAX256-NEXT:    [[TMP143:%.*]] = extractelement <8 x float> [[TMP32]], i32 0
-; MAX256-NEXT:    [[TMP144:%.*]] = insertelement <8 x float> poison, float [[TMP143]], i32 0
-; MAX256-NEXT:    [[TMP145:%.*]] = insertelement <8 x float> [[TMP144]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP146:%.*]] = insertelement <8 x float> [[TMP145]], float [[TMP65]], i32 2
-; MAX256-NEXT:    [[TMP147:%.*]] = insertelement <8 x float> [[TMP146]], float [[FVAL]], i32 3
-; MAX256-NEXT:    [[TMP148:%.*]] = extractelement <8 x float> [[TMP32]], i32 4
-; MAX256-NEXT:    [[TMP149:%.*]] = insertelement <8 x float> [[TMP147]], float [[TMP148]], i32 4
-; MAX256-NEXT:    [[TMP150:%.*]] = insertelement <8 x float> [[TMP149]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP151:%.*]] = insertelement <8 x float> [[TMP150]], float [[TMP71]], i32 6
-; MAX256-NEXT:    [[TMP152:%.*]] = insertelement <8 x float> [[TMP151]], float [[FVAL]], i32 7
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb2:
-; MAX256-NEXT:    [[TMP153:%.*]] = phi <8 x float> [ [[TMP14]], [[BB3]] ], [ [[TMP83]], [[BB4]] ], [ [[TMP122]], [[BB5]] ], [ [[TMP44]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6
-; MAX256-NEXT:    store float [[TMP157]], float* undef, align 4
+; MAX256-NEXT:    [[TMP48:%.*]] = phi <8 x float> [ [[TMP27]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP49:%.*]] = phi <8 x float> [ [[TMP37]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP37]], [[BB5]] ], [ [[TMP37]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP50:%.*]] = phi <8 x float> [ [[TMP47]], [[BB3]] ], [ [[TMP47]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP47]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP51:%.*]] = phi <8 x float> [ [[TMP17]], [[BB3]] ], [ [[TMP17]], [[BB4]] ], [ [[TMP17]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP52:%.*]] = extractelement <8 x float> [[TMP49]], i32 7
+; MAX256-NEXT:    store float [[TMP52]], float* undef, align 4
 ; MAX256-NEXT:    ret void
 ;
 ; MAX1024-LABEL: @phi_float32(
 ; MAX1024-NEXT:  bb:
 ; MAX1024-NEXT:    br label [[BB1:%.*]]
 ; MAX1024:       bb1:
-; MAX1024-NEXT:    [[TMP0:%.*]] = insertelement <4 x half> poison, half [[HVAL:%.*]], i32 0
-; MAX1024-NEXT:    [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
-; MAX1024-NEXT:    [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2
-; MAX1024-NEXT:    [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3
-; MAX1024-NEXT:    [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float>
-; MAX1024-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
-; MAX1024-NEXT:    [[TMP5:%.*]] = insertelement <32 x float> poison, float [[FVAL:%.*]], i32 0
-; MAX1024-NEXT:    [[TMP6:%.*]] = insertelement <32 x float> [[TMP5]], float [[FVAL]], i32 1
-; MAX1024-NEXT:    [[TMP7:%.*]] = insertelement <32 x float> [[TMP6]], float [[FVAL]], i32 2
-; MAX1024-NEXT:    [[TMP8:%.*]] = insertelement <32 x float> [[TMP7]], float [[FVAL]], i32 3
-; MAX1024-NEXT:    [[TMP9:%.*]] = insertelement <32 x float> [[TMP8]], float [[FVAL]], i32 4
-; MAX1024-NEXT:    [[TMP10:%.*]] = insertelement <32 x float> [[TMP9]], float [[FVAL]], i32 5
-; MAX1024-NEXT:    [[TMP11:%.*]] = insertelement <32 x float> [[TMP10]], float [[FVAL]], i32 6
-; MAX1024-NEXT:    [[TMP12:%.*]] = insertelement <32 x float> [[TMP11]], float [[FVAL]], i32 7
-; MAX1024-NEXT:    [[TMP13:%.*]] = insertelement <32 x float> [[TMP12]], float [[FVAL]], i32 8
-; MAX1024-NEXT:    [[TMP14:%.*]] = insertelement <32 x float> [[TMP13]], float [[FVAL]], i32 9
-; MAX1024-NEXT:    [[TMP15:%.*]] = insertelement <32 x float> [[TMP14]], float [[FVAL]], i32 10
-; MAX1024-NEXT:    [[TMP16:%.*]] = insertelement <32 x float> [[TMP15]], float [[FVAL]], i32 11
-; MAX1024-NEXT:    [[TMP17:%.*]] = insertelement <32 x float> [[TMP16]], float [[FVAL]], i32 12
-; MAX1024-NEXT:    [[TMP18:%.*]] = insertelement <32 x float> [[TMP17]], float [[FVAL]], i32 13
-; MAX1024-NEXT:    [[TMP19:%.*]] = insertelement <32 x float> [[TMP18]], float [[FVAL]], i32 14
-; MAX1024-NEXT:    [[TMP20:%.*]] = insertelement <32 x float> [[TMP19]], float [[FVAL]], i32 15
-; MAX1024-NEXT:    [[TMP21:%.*]] = insertelement <32 x float> [[TMP20]], float [[FVAL]], i32 16
-; MAX1024-NEXT:    [[TMP22:%.*]] = insertelement <32 x float> [[TMP21]], float [[FVAL]], i32 17
-; MAX1024-NEXT:    [[TMP23:%.*]] = insertelement <32 x float> [[TMP22]], float [[FVAL]], i32 18
-; MAX1024-NEXT:    [[TMP24:%.*]] = insertelement <32 x float> [[TMP23]], float [[FVAL]], i32 19
-; MAX1024-NEXT:    [[TMP25:%.*]] = insertelement <32 x float> [[TMP24]], float [[FVAL]], i32 20
-; MAX1024-NEXT:    [[TMP26:%.*]] = insertelement <32 x float> [[TMP25]], float [[FVAL]], i32 21
-; MAX1024-NEXT:    [[TMP27:%.*]] = insertelement <32 x float> [[TMP26]], float [[FVAL]], i32 22
-; MAX1024-NEXT:    [[TMP28:%.*]] = insertelement <32 x float> [[TMP27]], float [[FVAL]], i32 23
-; MAX1024-NEXT:    [[TMP29:%.*]] = insertelement <32 x float> [[TMP28]], float [[FVAL]], i32 24
-; MAX1024-NEXT:    [[TMP30:%.*]] = insertelement <32 x float> [[TMP29]], float [[FVAL]], i32 25
-; MAX1024-NEXT:    [[TMP31:%.*]] = insertelement <32 x float> [[TMP30]], float [[FVAL]], i32 26
-; MAX1024-NEXT:    [[TMP32:%.*]] = insertelement <32 x float> [[TMP31]], float [[FVAL]], i32 27
-; MAX1024-NEXT:    [[TMP33:%.*]] = insertelement <32 x float> [[TMP32]], float [[FVAL]], i32 28
-; MAX1024-NEXT:    [[TMP34:%.*]] = insertelement <32 x float> [[TMP33]], float [[FVAL]], i32 29
-; MAX1024-NEXT:    [[TMP35:%.*]] = insertelement <32 x float> [[TMP34]], float [[FVAL]], i32 30
-; MAX1024-NEXT:    [[TMP36:%.*]] = insertelement <32 x float> [[TMP35]], float [[FVAL]], i32 31
-; MAX1024-NEXT:    [[TMP37:%.*]] = fmul <32 x float> [[SHUFFLE]], [[TMP36]]
-; MAX1024-NEXT:    [[TMP38:%.*]] = fadd <32 x float> zeroinitializer, [[TMP37]]
-; MAX1024-NEXT:    [[TMP39:%.*]] = extractelement <32 x float> [[TMP38]], i32 0
-; MAX1024-NEXT:    [[TMP40:%.*]] = insertelement <32 x float> poison, float [[TMP39]], i32 0
-; MAX1024-NEXT:    [[TMP41:%.*]] = extractelement <32 x float> [[TMP38]], i32 1
-; MAX1024-NEXT:    [[TMP42:%.*]] = insertelement <32 x float> [[TMP40]], float [[TMP41]], i32 1
-; MAX1024-NEXT:    [[TMP43:%.*]] = insertelement <32 x float> [[TMP42]], float [[FVAL]], i32 2
-; MAX1024-NEXT:    [[TMP44:%.*]] = insertelement <32 x float> [[TMP43]], float [[FVAL]], i32 3
-; MAX1024-NEXT:    [[TMP45:%.*]] = extractelement <32 x float> [[TMP38]], i32 4
-; MAX1024-NEXT:    [[TMP46:%.*]] = insertelement <32 x float> [[TMP44]], float [[TMP45]], i32 4
-; MAX1024-NEXT:    [[TMP47:%.*]] = extractelement <32 x float> [[TMP38]], i32 5
-; MAX1024-NEXT:    [[TMP48:%.*]] = insertelement <32 x float> [[TMP46]], float [[TMP47]], i32 5
-; MAX1024-NEXT:    [[TMP49:%.*]] = insertelement <32 x float> [[TMP48]], float [[FVAL]], i32 6
-; MAX1024-NEXT:    [[TMP50:%.*]] = insertelement <32 x float> [[TMP49]], float [[FVAL]], i32 7
-; MAX1024-NEXT:    [[TMP51:%.*]] = insertelement <32 x float> [[TMP50]], float [[FVAL]], i32 8
-; MAX1024-NEXT:    [[TMP52:%.*]] = insertelement <32 x float> [[TMP51]], float [[FVAL]], i32 9
-; MAX1024-NEXT:    [[TMP53:%.*]] = extractelement <32 x float> [[TMP38]], i32 10
-; MAX1024-NEXT:    [[TMP54:%.*]] = insertelement <32 x float> [[TMP52]], float [[TMP53]], i32 10
-; MAX1024-NEXT:    [[TMP55:%.*]] = extractelement <32 x float> [[TMP38]], i32 11
-; MAX1024-NEXT:    [[TMP56:%.*]] = insertelement <32 x float> [[TMP54]], float [[TMP55]], i32 11
-; MAX1024-NEXT:    [[TMP57:%.*]] = insertelement <32 x float> [[TMP56]], float [[FVAL]], i32 12
-; MAX1024-NEXT:    [[TMP58:%.*]] = insertelement <32 x float> [[TMP57]], float [[FVAL]], i32 13
-; MAX1024-NEXT:    [[TMP59:%.*]] = extractelement <32 x float> [[TMP38]], i32 14
-; MAX1024-NEXT:    [[TMP60:%.*]] = insertelement <32 x float> [[TMP58]], float [[TMP59]], i32 14
-; MAX1024-NEXT:    [[TMP61:%.*]] = extractelement <32 x float> [[TMP38]], i32 15
-; MAX1024-NEXT:    [[TMP62:%.*]] = insertelement <32 x float> [[TMP60]], float [[TMP61]], i32 15
-; MAX1024-NEXT:    [[TMP63:%.*]] = insertelement <32 x float> [[TMP62]], float [[FVAL]], i32 16
-; MAX1024-NEXT:    [[TMP64:%.*]] = insertelement <32 x float> [[TMP63]], float [[FVAL]], i32 17
-; MAX1024-NEXT:    [[TMP65:%.*]] = extractelement <32 x float> [[TMP38]], i32 18
-; MAX1024-NEXT:    [[TMP66:%.*]] = insertelement <32 x float> [[TMP64]], float [[TMP65]], i32 18
-; MAX1024-NEXT:    [[TMP67:%.*]] = extractelement <32 x float> [[TMP38]], i32 19
-; MAX1024-NEXT:    [[TMP68:%.*]] = insertelement <32 x float> [[TMP66]], float [[TMP67]], i32 19
-; MAX1024-NEXT:    [[TMP69:%.*]] = insertelement <32 x float> [[TMP68]], float [[FVAL]], i32 20
-; MAX1024-NEXT:    [[TMP70:%.*]] = insertelement <32 x float> [[TMP69]], float [[FVAL]], i32 21
-; MAX1024-NEXT:    [[TMP71:%.*]] = extractelement <32 x float> [[TMP38]], i32 22
-; MAX1024-NEXT:    [[TMP72:%.*]] = insertelement <32 x float> [[TMP70]], float [[TMP71]], i32 22
-; MAX1024-NEXT:    [[TMP73:%.*]] = extractelement <32 x float> [[TMP38]], i32 23
-; MAX1024-NEXT:    [[TMP74:%.*]] = insertelement <32 x float> [[TMP72]], float [[TMP73]], i32 23
-; MAX1024-NEXT:    [[TMP75:%.*]] = insertelement <32 x float> [[TMP74]], float [[FVAL]], i32 24
-; MAX1024-NEXT:    [[TMP76:%.*]] = insertelement <32 x float> [[TMP75]], float [[FVAL]], i32 25
-; MAX1024-NEXT:    [[TMP77:%.*]] = extractelement <32 x float> [[TMP38]], i32 26
-; MAX1024-NEXT:    [[TMP78:%.*]] = insertelement <32 x float> [[TMP76]], float [[TMP77]], i32 26
-; MAX1024-NEXT:    [[TMP79:%.*]] = extractelement <32 x float> [[TMP38]], i32 27
-; MAX1024-NEXT:    [[TMP80:%.*]] = insertelement <32 x float> [[TMP78]], float [[TMP79]], i32 27
-; MAX1024-NEXT:    [[TMP81:%.*]] = insertelement <32 x float> [[TMP80]], float [[FVAL]], i32 28
-; MAX1024-NEXT:    [[TMP82:%.*]] = insertelement <32 x float> [[TMP81]], float [[FVAL]], i32 29
-; MAX1024-NEXT:    [[TMP83:%.*]] = extractelement <32 x float> [[TMP38]], i32 30
-; MAX1024-NEXT:    [[TMP84:%.*]] = insertelement <32 x float> [[TMP82]], float [[TMP83]], i32 30
-; MAX1024-NEXT:    [[TMP85:%.*]] = extractelement <32 x float> [[TMP38]], i32 31
-; MAX1024-NEXT:    [[TMP86:%.*]] = insertelement <32 x float> [[TMP84]], float [[TMP85]], i32 31
+; MAX1024-NEXT:    [[I:%.*]] = fpext half [[HVAL:%.*]] to float
+; MAX1024-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
+; MAX1024-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> [[TMP0]], float [[I]], i32 1
+; MAX1024-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I]], i32 2
+; MAX1024-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I]], i32 3
+; MAX1024-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I]], i32 4
+; MAX1024-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I]], i32 5
+; MAX1024-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I]], i32 6
+; MAX1024-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I]], i32 7
+; MAX1024-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
+; MAX1024-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[FVAL]], i32 1
+; MAX1024-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP9]], float [[FVAL]], i32 2
+; MAX1024-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[FVAL]], i32 3
+; MAX1024-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[FVAL]], i32 4
+; MAX1024-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[FVAL]], i32 5
+; MAX1024-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[FVAL]], i32 6
+; MAX1024-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[FVAL]], i32 7
+; MAX1024-NEXT:    [[TMP16:%.*]] = fmul <8 x float> [[TMP7]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP17:%.*]] = fadd <8 x float> zeroinitializer, [[TMP16]]
+; MAX1024-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX1024-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I3]], i32 1
+; MAX1024-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP19]], float [[I3]], i32 2
+; MAX1024-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I3]], i32 3
+; MAX1024-NEXT:    [[TMP22:%.*]] = insertelement <8 x float> [[TMP21]], float [[I3]], i32 4
+; MAX1024-NEXT:    [[TMP23:%.*]] = insertelement <8 x float> [[TMP22]], float [[I3]], i32 5
+; MAX1024-NEXT:    [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[I3]], i32 6
+; MAX1024-NEXT:    [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[I3]], i32 7
+; MAX1024-NEXT:    [[TMP26:%.*]] = fmul <8 x float> [[TMP25]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP27:%.*]] = fadd <8 x float> zeroinitializer, [[TMP26]]
+; MAX1024-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX1024-NEXT:    [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[I6]], i32 1
+; MAX1024-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP29]], float [[I6]], i32 2
+; MAX1024-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[I6]], i32 3
+; MAX1024-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[I6]], i32 4
+; MAX1024-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[I6]], i32 5
+; MAX1024-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[I6]], i32 6
+; MAX1024-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[I6]], i32 7
+; MAX1024-NEXT:    [[TMP36:%.*]] = fmul <8 x float> [[TMP35]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP37:%.*]] = fadd <8 x float> zeroinitializer, [[TMP36]]
+; MAX1024-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX1024-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[I9]], i32 1
+; MAX1024-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[I9]], i32 2
+; MAX1024-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[I9]], i32 3
+; MAX1024-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[I9]], i32 4
+; MAX1024-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[I9]], i32 5
+; MAX1024-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[I9]], i32 6
+; MAX1024-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[I9]], i32 7
+; MAX1024-NEXT:    [[TMP46:%.*]] = fmul <8 x float> [[TMP45]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP47:%.*]] = fadd <8 x float> zeroinitializer, [[TMP46]]
 ; MAX1024-NEXT:    switch i32 undef, label [[BB5:%.*]] [
 ; MAX1024-NEXT:    i32 0, label [[BB2:%.*]]
 ; MAX1024-NEXT:    i32 1, label [[BB3:%.*]]
@@ -406,91 +268,16 @@
 ; MAX1024:       bb3:
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb4:
-; MAX1024-NEXT:    [[TMP87:%.*]] = insertelement <32 x float> [[TMP40]], float [[FVAL]], i32 1
-; MAX1024-NEXT:    [[TMP88:%.*]] = insertelement <32 x float> [[TMP87]], float [[FVAL]], i32 2
-; MAX1024-NEXT:    [[TMP89:%.*]] = extractelement <32 x float> [[TMP38]], i32 3
-; MAX1024-NEXT:    [[TMP90:%.*]] = insertelement <32 x float> [[TMP88]], float [[TMP89]], i32 3
-; MAX1024-NEXT:    [[TMP91:%.*]] = insertelement <32 x float> [[TMP90]], float [[TMP45]], i32 4
-; MAX1024-NEXT:    [[TMP92:%.*]] = insertelement <32 x float> [[TMP91]], float [[FVAL]], i32 5
-; MAX1024-NEXT:    [[TMP93:%.*]] = insertelement <32 x float> [[TMP92]], float [[FVAL]], i32 6
-; MAX1024-NEXT:    [[TMP94:%.*]] = extractelement <32 x float> [[TMP38]], i32 7
-; MAX1024-NEXT:    [[TMP95:%.*]] = insertelement <32 x float> [[TMP93]], float [[TMP94]], i32 7
-; MAX1024-NEXT:    [[TMP96:%.*]] = extractelement <32 x float> [[TMP38]], i32 8
-; MAX1024-NEXT:    [[TMP97:%.*]] = insertelement <32 x float> [[TMP95]], float [[TMP96]], i32 8
-; MAX1024-NEXT:    [[TMP98:%.*]] = insertelement <32 x float> [[TMP97]], float [[FVAL]], i32 9
-; MAX1024-NEXT:    [[TMP99:%.*]] = insertelement <32 x float> [[TMP98]], float [[FVAL]], i32 10
-; MAX1024-NEXT:    [[TMP100:%.*]] = insertelement <32 x float> [[TMP99]], float [[TMP55]], i32 11
-; MAX1024-NEXT:    [[TMP101:%.*]] = extractelement <32 x float> [[TMP38]], i32 12
-; MAX1024-NEXT:    [[TMP102:%.*]] = insertelement <32 x float> [[TMP100]], float [[TMP101]], i32 12
-; MAX1024-NEXT:    [[TMP103:%.*]] = insertelement <32 x float> [[TMP102]], float [[FVAL]], i32 13
-; MAX1024-NEXT:    [[TMP104:%.*]] = insertelement <32 x float> [[TMP103]], float [[FVAL]], i32 14
-; MAX1024-NEXT:    [[TMP105:%.*]] = insertelement <32 x float> [[TMP104]], float [[TMP61]], i32 15
-; MAX1024-NEXT:    [[TMP106:%.*]] = extractelement <32 x float> [[TMP38]], i32 16
-; MAX1024-NEXT:    [[TMP107:%.*]] = insertelement <32 x float> [[TMP105]], float [[TMP106]], i32 16
-; MAX1024-NEXT:    [[TMP108:%.*]] = insertelement <32 x float> [[TMP107]], float [[FVAL]], i32 17
-; MAX1024-NEXT:    [[TMP109:%.*]] = insertelement <32 x float> [[TMP108]], float [[FVAL]], i32 18
-; MAX1024-NEXT:    [[TMP110:%.*]] = insertelement <32 x float> [[TMP109]], float [[TMP67]], i32 19
-; MAX1024-NEXT:    [[TMP111:%.*]] = extractelement <32 x float> [[TMP38]], i32 20
-; MAX1024-NEXT:    [[TMP112:%.*]] = insertelement <32 x float> [[TMP110]], float [[TMP111]], i32 20
-; MAX1024-NEXT:    [[TMP113:%.*]] = insertelement <32 x float> [[TMP112]], float [[FVAL]], i32 21
-; MAX1024-NEXT:    [[TMP114:%.*]] = insertelement <32 x float> [[TMP113]], float [[FVAL]], i32 22
-; MAX1024-NEXT:    [[TMP115:%.*]] = insertelement <32 x float> [[TMP114]], float [[TMP73]], i32 23
-; MAX1024-NEXT:    [[TMP116:%.*]] = extractelement <32 x float> [[TMP38]], i32 24
-; MAX1024-NEXT:    [[TMP117:%.*]] = insertelement <32 x float> [[TMP115]], float [[TMP116]], i32 24
-; MAX1024-NEXT:    [[TMP118:%.*]] = insertelement <32 x float> [[TMP117]], float [[FVAL]], i32 25
-; MAX1024-NEXT:    [[TMP119:%.*]] = insertelement <32 x float> [[TMP118]], float [[FVAL]], i32 26
-; MAX1024-NEXT:    [[TMP120:%.*]] = insertelement <32 x float> [[TMP119]], float [[TMP79]], i32 27
-; MAX1024-NEXT:    [[TMP121:%.*]] = extractelement <32 x float> [[TMP38]], i32 28
-; MAX1024-NEXT:    [[TMP122:%.*]] = insertelement <32 x float> [[TMP120]], float [[TMP121]], i32 28
-; MAX1024-NEXT:    [[TMP123:%.*]] = insertelement <32 x float> [[TMP122]], float [[FVAL]], i32 29
-; MAX1024-NEXT:    [[TMP124:%.*]] = insertelement <32 x float> [[TMP123]], float [[FVAL]], i32 30
-; MAX1024-NEXT:    [[TMP125:%.*]] = insertelement <32 x float> [[TMP124]], float [[TMP85]], i32 31
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb5:
-; MAX1024-NEXT:    [[TMP126:%.*]] = insertelement <32 x float> [[TMP5]], float [[TMP41]], i32 1
-; MAX1024-NEXT:    [[TMP127:%.*]] = insertelement <32 x float> [[TMP126]], float [[FVAL]], i32 2
-; MAX1024-NEXT:    [[TMP128:%.*]] = extractelement <32 x float> [[TMP38]], i32 3
-; MAX1024-NEXT:    [[TMP129:%.*]] = insertelement <32 x float> [[TMP127]], float [[TMP128]], i32 3
-; MAX1024-NEXT:    [[TMP130:%.*]] = insertelement <32 x float> [[TMP129]], float [[FVAL]], i32 4
-; MAX1024-NEXT:    [[TMP131:%.*]] = insertelement <32 x float> [[TMP130]], float [[TMP47]], i32 5
-; MAX1024-NEXT:    [[TMP132:%.*]] = insertelement <32 x float> [[TMP131]], float [[FVAL]], i32 6
-; MAX1024-NEXT:    [[TMP133:%.*]] = extractelement <32 x float> [[TMP38]], i32 7
-; MAX1024-NEXT:    [[TMP134:%.*]] = insertelement <32 x float> [[TMP132]], float [[TMP133]], i32 7
-; MAX1024-NEXT:    [[TMP135:%.*]] = extractelement <32 x float> [[TMP38]], i32 8
-; MAX1024-NEXT:    [[TMP136:%.*]] = insertelement <32 x float> [[TMP134]], float [[TMP135]], i32 8
-; MAX1024-NEXT:    [[TMP137:%.*]] = insertelement <32 x float> [[TMP136]], float [[FVAL]], i32 9
-; MAX1024-NEXT:    [[TMP138:%.*]] = insertelement <32 x float> [[TMP137]], float [[TMP53]], i32 10
-; MAX1024-NEXT:    [[TMP139:%.*]] = insertelement <32 x float> [[TMP138]], float [[FVAL]], i32 11
-; MAX1024-NEXT:    [[TMP140:%.*]] = extractelement <32 x float> [[TMP38]], i32 12
-; MAX1024-NEXT:    [[TMP141:%.*]] = insertelement <32 x float> [[TMP139]], float [[TMP140]], i32 12
-; MAX1024-NEXT:    [[TMP142:%.*]] = insertelement <32 x float> [[TMP141]], float [[FVAL]], i32 13
-; MAX1024-NEXT:    [[TMP143:%.*]] = insertelement <32 x float> [[TMP142]], float [[TMP59]], i32 14
-; MAX1024-NEXT:    [[TMP144:%.*]] = insertelement <32 x float> [[TMP143]], float [[FVAL]], i32 15
-; MAX1024-NEXT:    [[TMP145:%.*]] = extractelement <32 x float> [[TMP38]], i32 16
-; MAX1024-NEXT:    [[TMP146:%.*]] = insertelement <32 x float> [[TMP144]], float [[TMP145]], i32 16
-; MAX1024-NEXT:    [[TMP147:%.*]] = insertelement <32 x float> [[TMP146]], float [[FVAL]], i32 17
-; MAX1024-NEXT:    [[TMP148:%.*]] = insertelement <32 x float> [[TMP147]], float [[TMP65]], i32 18
-; MAX1024-NEXT:    [[TMP149:%.*]] = insertelement <32 x float> [[TMP148]], float [[FVAL]], i32 19
-; MAX1024-NEXT:    [[TMP150:%.*]] = extractelement <32 x float> [[TMP38]], i32 20
-; MAX1024-NEXT:    [[TMP151:%.*]] = insertelement <32 x float> [[TMP149]], float [[TMP150]], i32 20
-; MAX1024-NEXT:    [[TMP152:%.*]] = insertelement <32 x float> [[TMP151]], float [[FVAL]], i32 21
-; MAX1024-NEXT:    [[TMP153:%.*]] = insertelement <32 x float> [[TMP152]], float [[TMP71]], i32 22
-; MAX1024-NEXT:    [[TMP154:%.*]] = insertelement <32 x float> [[TMP153]], float [[FVAL]], i32 23
-; MAX1024-NEXT:    [[TMP155:%.*]] = extractelement <32 x float> [[TMP38]], i32 24
-; MAX1024-NEXT:    [[TMP156:%.*]] = insertelement <32 x float> [[TMP154]], float [[TMP155]], i32 24
-; MAX1024-NEXT:    [[TMP157:%.*]] = insertelement <32 x float> [[TMP156]], float [[FVAL]], i32 25
-; MAX1024-NEXT:    [[TMP158:%.*]] = insertelement <32 x float> [[TMP157]], float [[TMP77]], i32 26
-; MAX1024-NEXT:    [[TMP159:%.*]] = insertelement <32 x float> [[TMP158]], float [[FVAL]], i32 27
-; MAX1024-NEXT:    [[TMP160:%.*]] = extractelement <32 x float> [[TMP38]], i32 28
-; MAX1024-NEXT:    [[TMP161:%.*]] = insertelement <32 x float> [[TMP159]], float [[TMP160]], i32 28
-; MAX1024-NEXT:    [[TMP162:%.*]] = insertelement <32 x float> [[TMP161]], float [[FVAL]], i32 29
-; MAX1024-NEXT:    [[TMP163:%.*]] = insertelement <32 x float> [[TMP162]], float [[TMP83]], i32 30
-; MAX1024-NEXT:    [[TMP164:%.*]] = insertelement <32 x float> [[TMP163]], float [[FVAL]], i32 31
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb2:
-; MAX1024-NEXT:    [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]
-; MAX1024-NEXT:    [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30
-; MAX1024-NEXT:    store float [[TMP166]], float* undef, align 4
+; MAX1024-NEXT:    [[TMP48:%.*]] = phi <8 x float> [ [[TMP27]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP49:%.*]] = phi <8 x float> [ [[TMP37]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP37]], [[BB5]] ], [ [[TMP37]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP50:%.*]] = phi <8 x float> [ [[TMP47]], [[BB3]] ], [ [[TMP47]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP47]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP51:%.*]] = phi <8 x float> [ [[TMP17]], [[BB3]] ], [ [[TMP17]], [[BB4]] ], [ [[TMP17]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP52:%.*]] = extractelement <8 x float> [[TMP49]], i32 7
+; MAX1024-NEXT:    store float [[TMP52]], float* undef, align 4
 ; MAX1024-NEXT:    ret void
 ;
 bb: