Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -119,6 +119,15 @@
 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
 
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable tree partial vectorize with throttling"));
+
+static cl::opt<unsigned>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(32), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));
+
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
     cl::desc(
@@ -576,6 +585,8 @@
       MinVecRegSize = MinVectorRegSizeOption;
     else
       MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+    BuiltTrees.push_back(std::make_unique<TreeState>());
+    Tree = BuiltTrees.back().get();
   }
 
   /// Vectorize the tree that starts with the elements in \p VL.
@@ -589,11 +600,61 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();
+
+  /// \returns the cost extracting vectorized elements.
+  InstructionCost
+  getExtractCost(SmallPtrSetImpl<TreeEntry *> &ProposedToGather) const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  InstructionCost getInsertCost(SmallPtrSetImpl<TreeEntry *> &ProposedToGather);
+
+  struct TECostComparator {
+    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
+      return LHS->Cost > RHS->Cost;
+    }
+  };
+  using TEVectorizableSet = std::set<TreeEntry *, TECostComparator>;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. SLP algorithm looks to operations to vectorize starting
+  /// from seed instructions on the bottom toward the end of chains of
+  /// dependencies to the top of SLP graph, it groups potentially vectorizable
+  /// operations in scalar form to bundles.
+  /// For example:
+  ///
+  ///   <bundle 1> scalar form
+  ///      |
+  ///   <bundle 2> scalar form  <bundle 3> scalar form
+  ///       \                    /
+  ///        <seed root bundle> scalar form
+  ///
+  /// Total cost is not profitable to vectorize, hence all operations are in
+  /// scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> scalar form
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part on the tree, while all tree vectorization
+  /// does not make sense.
+  /// More details: https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf
+  bool findSubTree(TEVectorizableSet &Vec, InstructionCost UserCost);
+
+  /// Get raw summary of all elements of the tree.
+  InstructionCost getRawTreeCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost();
+  InstructionCost getTreeCost(bool TreeReduce = false,
+                              InstructionCost UserCost = 0);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -608,44 +669,35 @@
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
                  ArrayRef<Value *> UserIgnoreLst = None);
 
-  /// Clear the internal data structures that are created by 'buildTree'.
-  void deleteTree() {
-    VectorizableTree.clear();
-    ScalarToTreeEntry.clear();
-    MustGather.clear();
-    ExternalUses.clear();
-    NumOpsWantToKeepOrder.clear();
-    NumOpsWantToKeepOriginalOrder = 0;
-    for (auto &Iter : BlocksSchedules) {
-      BlockScheduling *BS = Iter.second.get();
-      BS->clear();
-    }
-    MinBWs.clear();
+  /// Save current tree for possible later vectorization.
+  void saveTree() {
+    BuiltTrees.push_back(std::make_unique<TreeState>());
+    Tree = BuiltTrees.back().get();
   }
 
-  unsigned getTreeSize() const { return VectorizableTree.size(); }
+  unsigned getTreeSize() const { return Tree->VectorizableTree.size(); }
 
   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
   /// \returns The best order of instructions for vectorization.
   Optional<ArrayRef<unsigned>> bestOrder() const {
-    assert(llvm::all_of(
-               NumOpsWantToKeepOrder,
-               [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
-                 return D.getFirst().size() ==
-                        VectorizableTree[0]->Scalars.size();
-               }) &&
+    assert(llvm::all_of(Tree->NumOpsWantToKeepOrder,
+                        [this](const decltype(
+                            Tree->NumOpsWantToKeepOrder)::value_type &D) {
+                          return D.getFirst().size() ==
+                                 Tree->VectorizableTree[0]->Scalars.size();
+                        }) &&
            "All orders must have the same size as number of instructions in "
            "tree node.");
     auto I = std::max_element(
-        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
-        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
-           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+        Tree->NumOpsWantToKeepOrder.begin(), Tree->NumOpsWantToKeepOrder.end(),
+        [](const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D1,
+           const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D2) {
           return D1.second < D2.second;
         });
-    if (I == NumOpsWantToKeepOrder.end() ||
-        I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+    if (I == Tree->NumOpsWantToKeepOrder.end() ||
+        I->getSecond() <= Tree->NumOpsWantToKeepOriginalOrder)
       return None;
 
     return makeArrayRef(I->getFirst());
@@ -667,7 +719,7 @@
   void findRootOrder(OrdersType &Order) {
     // If the leaf has the same number of instructions to vectorize as the root
     // - order must be set already.
-    unsigned RootSize = VectorizableTree[0]->Scalars.size();
+    unsigned RootSize = Tree->VectorizableTree[0]->Scalars.size();
     if (Order.size() == RootSize)
       return;
     SmallVector<unsigned, 4> RealOrder(Order.size());
@@ -678,7 +730,7 @@
     // The leaf has less number of instructions - need to find the true order of
     // the root.
     // Scan the nodes starting from the leaf back to the root.
-    const TreeEntry *PNode = VectorizableTree.back().get();
+    const TreeEntry *PNode = Tree->VectorizableTree.back().get();
     SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
     SmallPtrSet<const TreeEntry *, 4> Visited;
     while (!Nodes.empty() && Order.size() != RootSize) {
@@ -783,6 +835,12 @@
   ///       may not be necessary.
   bool isLoadCombineCandidate() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree(SmallPtrSetImpl<TreeEntry *> &ProposedToGather);
+
+  /// Try partially vectorize the tree via throttling.
+  bool tryPartialVectorization();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1585,6 +1643,9 @@
     /// Does this entry require reordering?
     SmallVector<unsigned, 4> ReorderIndices;
 
+    /// Cost of this tree entry.
+    InstructionCost Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -1597,6 +1658,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// Use of this entry.
+    TinyPtrVector<TreeEntry *> UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -1801,9 +1865,10 @@
     assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
             (Bundle && EntryState != TreeEntry::NeedToGather)) &&
            "Need to vectorize gather entry?");
-    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
-    TreeEntry *Last = VectorizableTree.back().get();
-    Last->Idx = VectorizableTree.size() - 1;
+    Tree->VectorizableTree.push_back(
+        std::make_unique<TreeEntry>(Tree->VectorizableTree));
+    TreeEntry *Last = Tree->VectorizableTree.back().get();
+    Last->Idx = Tree->VectorizableTree.size() - 1;
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->State = EntryState;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
@@ -1813,7 +1878,7 @@
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
         assert(!getTreeEntry(V) && "Scalar already in tree!");
-        ScalarToTreeEntry[V] = Last;
+        Tree->ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
       unsigned Lane = 0;
@@ -1826,44 +1891,38 @@
       assert((!Bundle.getValue() || Lane == VL.size()) &&
              "Bundle and VL out of sync");
     } else {
-      MustGather.insert(VL.begin(), VL.end());
+      Tree->MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      Tree->VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(
+          Last);
+    }
 
     return Last;
   }
 
-  /// -- Vectorization State --
-  /// Holds all of the tree entries.
-  TreeEntry::VecTreeTy VectorizableTree;
-
 #ifndef NDEBUG
   /// Debug printer.
   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
-    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
-      VectorizableTree[Id]->dump();
+    for (unsigned Id = 0, IdE = Tree->VectorizableTree.size(); Id != IdE;
+         ++Id) {
+      Tree->VectorizableTree[Id]->dump();
       dbgs() << "\n";
     }
   }
 #endif
 
-  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
+  TreeEntry *getTreeEntry(Value *V) { return Tree->ScalarToTreeEntry.lookup(V); }
 
   const TreeEntry *getTreeEntry(Value *V) const {
-    return ScalarToTreeEntry.lookup(V);
+    return Tree->ScalarToTreeEntry.lookup(V);
   }
 
-  /// Maps a specific scalar to its tree entry.
-  SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
-
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
-  /// A list of scalars that we found that we need to keep as scalars.
-  ValueSet MustGather;
-
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -1880,6 +1939,9 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  /// \returns the cost of extracting the vectorized elements.
+  InstructionCost getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1924,12 +1986,6 @@
   /// eventually when the BoUpSLP is destructed.
   DenseMap<Instruction *, bool> DeletedInstructions;
 
-  /// A list of values that need to extracted out of the tree.
-  /// This list holds pairs of (Internal Scalar : External User). External User
-  /// can be nullptr, it means that this Internal Scalar will be used later,
-  /// after vectorization.
-  UserList ExternalUses;
-
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -2327,8 +2383,8 @@
     int SchedulingRegionID = 1;
   };
 
-  /// Attaches the BlockScheduling structures to basic blocks.
-  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+  /// Remove operations from the list of proposed to schedule.
+  void removeFromScheduling(BlockScheduling *BS);
 
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
@@ -2361,13 +2417,98 @@
     }
   };
 
-  /// Contains orders of operations along with the number of bundles that have
-  /// operations in this order. It stores only those orders that require
-  /// reordering, if reordering is not required it is counted using \a
-  /// NumOpsWantToKeepOriginalOrder.
-  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
-  /// Number of bundles that do not require reordering.
-  unsigned NumOpsWantToKeepOriginalOrder = 0;
+  /// Tree state that created by 'buildTree'.
+  struct TreeState {
+    using TreeStateTy = SmallVector<std::unique_ptr<TreeState>, 2>;
+
+    /// -- Vectorization State --
+    /// Holds all of the tree entries.
+    TreeEntry::VecTreeTy VectorizableTree;
+
+    /// Maps a specific scalar to its tree entry.
+    SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
+
+    /// Tree entries that should not be vectorized due to throttling.
+    SmallVector<TreeEntry *, 2> RemovedOperations;
+
+    /// Contains orders of operations along with the number of bundles that have
+    /// operations in this order. It stores only those orders that require
+    /// reordering, if reordering is not required it is counted using \a
+    /// NumOpsWantToKeepOriginalOrder.
+    DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>
+        NumOpsWantToKeepOrder;
+    /// Number of bundles that do not require reordering.
+    unsigned NumOpsWantToKeepOriginalOrder = 0;
+
+    /// A list of scalars that we found that we need to keep as scalars.
+    ValueSet MustGather;
+
+    /// Raw cost of all elemts in the tree.
+    InstructionCost RawTreeCost = 0;
+
+    /// Final cost of the tree.
+    InstructionCost TreeCost = 0;
+
+    /// Partail cost of the tree.
+    InstructionCost PartialCost = 0;
+
+    /// Indicate that no CallInst found in the tree and we don't need to
+    /// calculate spill cost.
+    bool NoCallInst = true;
+
+    /// True, if we have calucalte tree cost for the tree.
+    bool IsCostSumReady = false;
+
+    /// A list of values that need to extracted out of the tree.
+    /// This list holds pairs of (Internal Scalar : External User). External
+    /// User can be nullptr, it means that this Internal Scalar will be used
+    /// later, after vectorization.
+    UserList ExternalUses;
+
+    /// Current operations width to vectorize.
+    unsigned BundleWidth = 0;
+
+    /// Internal tree oprations proposed to be vectorized values use.
+    SmallDenseMap<Value *, UserList> InternalTreeUses;
+
+    /// Attaches the BlockScheduling structures to basic blocks.
+    MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+    /// A map of scalar integer values to the smallest bit width with which they
+    /// can legally be represented. The values map to (width, signed) pairs,
+    /// where "width" indicates the minimum bit width and "signed" is True if
+    /// the value must be signed-extended, rather than zero-extended, back to
+    /// its original width.
+    MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+
+    /// Clear the internal data structures that are created by 'buildTree'.
+    void deleteTree() {
+      VectorizableTree.clear();
+      ScalarToTreeEntry.clear();
+      MustGather.clear();
+      ExternalUses.clear();
+      InternalTreeUses.clear();
+      RemovedOperations.clear();
+      NumOpsWantToKeepOrder.clear();
+      NumOpsWantToKeepOriginalOrder = 0;
+      for (auto &Iter : BlocksSchedules) {
+        BlockScheduling *BS = Iter.second.get();
+        BS->clear();
+      }
+      MinBWs.clear();
+      NoCallInst = true;
+      RawTreeCost = 0;
+      TreeCost = 0;
+      PartialCost = 0;
+      IsCostSumReady = false;
+    }
+  };
+
+  // Previous trees that might be worth to vectorize.
+  TreeState::TreeStateTy BuiltTrees;
+
+  // Current tree that we consider.
+  TreeState *Tree = nullptr;
 
   // Analysis and block reference.
   Function *F;
@@ -2421,7 +2562,7 @@
   };
 
   static NodeRef getEntryNode(BoUpSLP &R) {
-    return R.VectorizableTree[0].get();
+    return R.Tree->VectorizableTree[0].get();
   }
 
   static ChildIteratorType child_begin(NodeRef N) {
@@ -2449,14 +2590,14 @@
   };
 
   static nodes_iterator nodes_begin(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.begin());
+    return nodes_iterator(R->Tree->VectorizableTree.begin());
   }
 
   static nodes_iterator nodes_end(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.end());
+    return nodes_iterator(R->Tree->VectorizableTree.end());
   }
 
-  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+  static unsigned size(BoUpSLP *R) { return R->Tree->VectorizableTree.size(); }
 };
 
 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
@@ -2473,9 +2614,9 @@
     }
     for (auto V : Entry->Scalars) {
       OS << *V;
-      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
-            return EU.Scalar == V;
-          }))
+      if (llvm::any_of(
+              R->Tree->ExternalUses,
+              [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
         OS << " <extract>";
       OS << "\n";
     }
@@ -2530,14 +2671,14 @@
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
                         ArrayRef<Value *> UserIgnoreLst) {
-  deleteTree();
+  Tree->deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
 
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -2559,7 +2700,7 @@
       if (ExtI != ExternallyUsedValues.end()) {
         LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                           << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
+        Tree->ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
       }
       for (User *U : Scalar->users()) {
         LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -2574,11 +2715,14 @@
           // Some in-tree scalars will remain as scalar in vectorized
           // instructions. If that is the case, the one in Lane 0 will
           // be used.
+          Tree->InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           if (UseScalar != U ||
               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                               << ".\n");
-            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+            assert((UseEntry->State == TreeEntry::Vectorize ||
+                    UseEntry->State == TreeEntry::ScatterVectorize) &&
+                   "Bad state");
             continue;
           }
         }
@@ -2589,7 +2733,7 @@
 
         LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                           << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+        Tree->ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
       }
     }
   }
@@ -2673,7 +2817,7 @@
   // we need to gather the scalars.
   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
   for (Value *V : VL) {
-    if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+    if (Tree->MustGather.count(V) || is_contained(UserIgnoreList, V)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
       return;
@@ -2717,7 +2861,7 @@
     VL = UniqueValues;
   }
 
-  auto &BSRef = BlocksSchedules[BB];
+  auto &BSRef = Tree->BlocksSchedules[BB];
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
 
@@ -2782,14 +2926,14 @@
       bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
       if (Reuse) {
         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOriginalOrder;
+        ++Tree->NumOpsWantToKeepOriginalOrder;
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies);
         // This is a special case, as it does not gather, but at the same time
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
         Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
+        Tree->VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
       if (!CurrentOrder.empty()) {
@@ -2805,12 +2949,12 @@
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies, CurrentOrder);
         findRootOrder(CurrentOrder);
-        ++NumOpsWantToKeepOrder[CurrentOrder];
+        ++Tree->NumOpsWantToKeepOrder[CurrentOrder];
         // This is a special case, as it does not gather, but at the same time
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
         Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
+        Tree->VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
       LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -2875,7 +3019,7 @@
         if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
           if (CurrentOrder.empty()) {
             // Original loads are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
+            ++Tree->NumOpsWantToKeepOriginalOrder;
             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
                                          UserTreeIdx, ReuseShuffleIndicies);
             TE->setOperandsInOrder();
@@ -2888,7 +3032,7 @@
             TE->setOperandsInOrder();
             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
             findRootOrder(CurrentOrder);
-            ++NumOpsWantToKeepOrder[CurrentOrder];
+            ++Tree->NumOpsWantToKeepOrder[CurrentOrder];
           }
           return;
         }
@@ -3154,7 +3298,7 @@
         if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
           if (CurrentOrder.empty()) {
             // Original stores are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
+            ++Tree->NumOpsWantToKeepOriginalOrder;
             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
                                          UserTreeIdx, ReuseShuffleIndicies);
             TE->setOperandsInOrder();
@@ -3168,7 +3312,7 @@
             buildTree_rec(Operands, Depth + 1, {TE, 0});
             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
             findRootOrder(CurrentOrder);
-            ++NumOpsWantToKeepOrder[CurrentOrder];
+            ++Tree->NumOpsWantToKeepOrder[CurrentOrder];
           }
           return;
         }
@@ -3307,6 +3451,60 @@
   }
 }
 
+void BoUpSLP::cutTree(SmallPtrSetImpl<TreeEntry *> &ProposedToGather) {
+  SmallVector<TreeEntry *, 4> VecNodes;
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::Vectorize &&
+        Entry->State != TreeEntry::ScatterVectorize)
+      continue;
+    // For all canceled operations we should consider the possibility of
+    // use by with non-canceled operations and for that, it requires
+    // to populate ExternalUser list with canceled elements.
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        TreeEntry *UserTE = getTreeEntry(U);
+        if (!UserTE || ProposedToGather.count(UserTE) == 0)
+          continue;
+        // Ignore users in the user ignore list.
+        auto *UserInst = dyn_cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        Tree->ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  }
+  // Canceling unprofitable elements.
+  for (TreeEntry *Entry : ProposedToGather) {
+    for (Value *V : Entry->Scalars) {
+      Tree->ScalarToTreeEntry.erase(V);
+#ifndef NDEBUG
+      LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                        << " out of proposed to vectorize.\n");
+#endif
+    }
+  }
+}
+
+bool BoUpSLP::tryPartialVectorization() {
+  if (BuiltTrees.size() == 1)
+    return false;
+  Tree = BuiltTrees.front().get();
+  vectorizeTree();
+  LLVM_DEBUG(dbgs() << "SLP: Decided to partially vectorize tree with cost: "
+                    << Tree->PartialCost << ".\n");
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3407,7 +3605,7 @@
 
 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
   return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
-           return ScalarToTreeEntry.count(U) > 0;
+           return Tree->ScalarToTreeEntry.count(U) > 0;
          });
 }
 
@@ -3464,7 +3662,8 @@
     ReuseShuffleCost =
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
   }
-  if (E->State == TreeEntry::NeedToGather) {
+  if (E->State != TreeEntry::Vectorize &&
+      E->State != TreeEntry::ScatterVectorize) {
     if (allConstant(VL))
       return 0;
     if (isSplat(VL)) {
@@ -3483,7 +3682,7 @@
           // instruction as dead and remove its cost from the final cost of the
           // vectorized tree.
           if (areAllUsersVectorized(cast<Instruction>(V)) &&
-              !ScalarToTreeEntry.count(V)) {
+              !Tree->ScalarToTreeEntry.count(V)) {
             auto *IO = cast<ConstantInt>(
                 cast<ExtractElementInst>(V)->getIndexOperand());
             Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -3867,25 +4066,26 @@
 
 bool BoUpSLP::isFullyVectorizableTinyTree() const {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
-                    << VectorizableTree.size() << " is fully vectorizable .\n");
+                    << Tree->VectorizableTree.size()
+                    << " is fully vectorizable .\n");
 
   // We only handle trees of heights 1 and 2.
-  if (VectorizableTree.size() == 1 &&
-      VectorizableTree[0]->State == TreeEntry::Vectorize)
+  if (Tree->VectorizableTree.size() == 1 &&
+      Tree->VectorizableTree[0]->State == TreeEntry::Vectorize)
     return true;
 
-  if (VectorizableTree.size() != 2)
+  if (Tree->VectorizableTree.size() != 2)
     return false;
 
   // Handle splat and all-constants stores.
-  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
-      (allConstant(VectorizableTree[1]->Scalars) ||
-       isSplat(VectorizableTree[1]->Scalars)))
+  if (Tree->VectorizableTree[0]->State == TreeEntry::Vectorize &&
+      (allConstant(Tree->VectorizableTree[1]->Scalars) ||
+       isSplat(Tree->VectorizableTree[1]->Scalars)))
     return true;
 
   // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
-      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+  if (Tree->VectorizableTree[0]->State != TreeEntry::Vectorize ||
+      Tree->VectorizableTree[1]->State != TreeEntry::Vectorize)
     return false;
 
   return true;
@@ -3929,16 +4129,16 @@
   if (RdxKind != RecurKind::Or)
     return false;
 
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+  unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size();
+  Value *FirstReduced = Tree->VectorizableTree[0]->Scalars[0];
   return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
 }
 
 bool BoUpSLP::isLoadCombineCandidate() const {
   // Peek through a final sequence of stores and check if all operations are
   // likely to be load-combined.
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  for (Value *Scalar : VectorizableTree[0]->Scalars) {
+  unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size();
+  for (Value *Scalar : Tree->VectorizableTree[0]->Scalars) {
     Value *X;
     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
         !isLoadCombineCandidateImpl(X, NumElts, TTI))
@@ -3950,7 +4150,7 @@
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
-  if (VectorizableTree.size() >= MinTreeSize)
+  if (Tree->VectorizableTree.size() >= MinTreeSize)
     return false;
 
   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
@@ -3958,8 +4158,8 @@
   if (isFullyVectorizableTinyTree())
     return false;
 
-  assert(VectorizableTree.empty()
-             ? ExternalUses.empty()
+  assert(Tree->VectorizableTree.empty()
+             ? Tree->ExternalUses.empty()
              : true && "We shouldn't have any external users");
 
   // Otherwise, we can't vectorize the tree. It is both tiny and not fully
@@ -3967,12 +4167,11 @@
   return true;
 }
 
-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   InstructionCost Cost = 0;
 
   SmallPtrSet<Instruction*, 4> LiveValues;
@@ -3985,7 +4184,7 @@
   // their order does not matter, as long as all instructions in a basic block
   // are grouped together. Using dominance ensures a deterministic order.
   SmallVector<Instruction *, 16> OrderedScalars;
-  for (const auto &TEPtr : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
     if (!Inst)
       continue;
@@ -4004,7 +4203,8 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+      if (isa<Instruction>(&*J) &&
+          Tree->ScalarToTreeEntry.find(&*J) != Tree->ScalarToTreeEntry.end())
         LiveValues.insert(cast<Instruction>(&*J));
     }
 
@@ -4037,9 +4237,10 @@
     }
 
     if (NumCalls) {
+      Tree->NoCallInst = false;
       SmallVector<Type*, 4> V;
       for (auto *II : LiveValues)
-        V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
+        V.push_back(FixedVectorType::get(II->getType(), Tree->BundleWidth));
       Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
     }
 
@@ -4049,16 +4250,110 @@
   return Cost;
 }
 
-InstructionCost BoUpSLP::getTreeCost() {
-  InstructionCost Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+InstructionCost BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const {
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), Tree->BundleWidth);
+  Value *ScalarRoot = Tree->VectorizableTree.front()->Scalars[0];
+
+  auto It = MinBWs.find(ScalarRoot);
+  if (It != MinBWs.end()) {
+    uint64_t Width = It->second.first;
+    bool Signed = It->second.second;
+    auto *MinTy = IntegerType::get(F->getContext(), Width);
+    unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt;
+    VecTy = FixedVectorType::get(MinTy, Tree->BundleWidth);
+    return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy,
+                                          EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
+
+InstructionCost
+BoUpSLP::getExtractCost(SmallPtrSetImpl<TreeEntry *> &ProposedToGather) const {
+  InstructionCost ExtractCost = 0;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
+  // Consider the possibility of extracting vectorized
+  // values for canceled elements use.
+  for (TreeEntry *Entry : ProposedToGather) {
+    for (Value *V : Entry->Scalars) {
+      // Consider the possibility of extracting vectorized
+      // values for canceled elements use.
+      auto It = Tree->InternalTreeUses.find(V);
+      if (It != Tree->InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+    }
+  }
+  for (const ExternalUser &EU : Tree->ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    ExtractCost += getExtractOperationCost(EU);
+  }
+  return ExtractCost;
+}
+
+InstructionCost
+BoUpSLP::getInsertCost(SmallPtrSetImpl<TreeEntry *> &ProposedToGather) {
+  InstructionCost InsertCost = 0;
+  for (TreeEntry *Entry : ProposedToGather) {
+    // Avoid already vectorized TreeEntries, it is already in a vector form and
+    // we don't need to gather those operations.
+    if (ProposedToGather.count(Entry) == 0)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      if (llvm::any_of(Inst->users(), [this](User *Op) {
+            return Tree->ScalarToTreeEntry.count(Op) > 0;
+          })) {
+        InsertCost += getEntryCost(Entry);
+        break;
+      }
+    }
+  }
+  return InsertCost;
+}
+
+bool BoUpSLP::findSubTree(TEVectorizableSet &Vec, InstructionCost UserCost) {
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    // Ignore any non-vectoriable entries, entries with low cost,
+    // or simple entries with just one operand or less.
+    if ((Entry->State != TreeEntry::Vectorize &&
+         Entry->State != TreeEntry::ScatterVectorize) ||
+        Entry->Cost <= 0 || !Entry->Idx)
+      continue;
+    Vec.insert(Entry);
+  }
+  InstructionCost Sum = 0;
+  for (TreeEntry *Entry : Vec)
+    Sum += Entry->Cost;
+  // Avoid reducing the tree if there is no potential room to reduce.
+  if ((Tree->TreeCost - UserCost - Sum) >= -SLPCostThreshold)
+    return false;
+
+  return (Vec.size() > 0);
+}
 
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+InstructionCost BoUpSLP::getRawTreeCost() {
+  InstructionCost CostSum = 0;
+  Tree->BundleWidth = Tree->VectorizableTree.front()->Scalars.size();
+  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+                    << Tree->VectorizableTree.size() << ".\n");
 
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();
     // We create duplicate tree entries for gather sequences that have multiple
     // uses. However, we should not compute the cost of duplicate sequences.
     // For example, if we have a build vector (i.e., insertelement sequence)
@@ -4071,73 +4366,115 @@
     // their uses. Since such an approach results in fewer total entries,
     // existing heuristics based on tree size may yield different results.
     //
-    if (TE.State == TreeEntry::NeedToGather &&
-        std::any_of(std::next(VectorizableTree.begin(), I + 1),
-                    VectorizableTree.end(),
-                    [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
-                      return EntryPtr->State == TreeEntry::NeedToGather &&
-                             EntryPtr->isSame(TE.Scalars);
-                    }))
+    if (TE.State != TreeEntry::Vectorize &&
+        TE.State != TreeEntry::ScatterVectorize &&
+        llvm::any_of(llvm::drop_begin(Tree->VectorizableTree, TE.Idx + 1),
+                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                       return EntryPtr->State != TreeEntry::Vectorize &&
+                              EntryPtr->State != TreeEntry::ScatterVectorize &&
+                              EntryPtr->isSame(TE.Scalars);
+                     }))
       continue;
 
-    InstructionCost C = getEntryCost(&TE);
-    Cost += C;
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
-                      << ".\n"
-                      << "SLP: Current total cost = " << Cost << "\n");
+                      << ".\n");
+    CostSum += TE.Cost;
+    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n");
   }
 
-  SmallPtrSet<Value *, 16> ExtractCostCalculated;
-  InstructionCost ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
-
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+  if (SLPThrottling)
+    for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State != TreeEntry::Vectorize &&
+          TE->State != TreeEntry::ScatterVectorize)
+        continue;
+      InstructionCost GatherCost = 0;
+      for (TreeEntry *Gather : TE->UseEntries)
+        if (Gather->State != TreeEntry::Vectorize &&
+            Gather->State != TreeEntry::ScatterVectorize)
+          GatherCost += Gather->Cost;
+      TE->Cost += GatherCost;
+    }
+  return CostSum;
+}
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = FixedVectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
-    }
+InstructionCost BoUpSLP::getTreeCost(bool TreeReduce,
+                                     InstructionCost UserCost) {
+  InstructionCost CostSum;
+  SmallPtrSet<TreeEntry *, 2> ProposedToGather;
+  if (!Tree->IsCostSumReady) {
+    CostSum = getRawTreeCost();
+    Tree->RawTreeCost = CostSum;
+  } else {
+    CostSum = Tree->RawTreeCost;
   }
 
-  InstructionCost SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  InstructionCost ExtractCost = getExtractCost(ProposedToGather);
+  InstructionCost SpillCost = 0;
+  if (!Tree->NoCallInst || !Tree->IsCostSumReady)
+    SpillCost = getSpillCost();
+  assert((!Tree->NoCallInst || getSpillCost() == 0) && "Incorrect spill cost");
+  if (!Tree->IsCostSumReady)
+    Tree->IsCostSumReady = true;
+  InstructionCost InsertCost = getInsertCost(ProposedToGather);
+  InstructionCost Cost = CostSum + ExtractCost + SpillCost + InsertCost;
+  Tree->TreeCost = Cost;
 
 #ifndef NDEBUG
   SmallString<256> Str;
-  {
-    raw_svector_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
-  }
+  raw_svector_ostream OS(Str);
+  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+     << "SLP: Extract Cost = " << ExtractCost << ".\n"
+     << "SLP: Insert Cost = " << InsertCost << ".\n"
+     << "SLP: Total Cost = " << Cost << ".\n";
   LLVM_DEBUG(dbgs() << Str);
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
 #endif
+  if (TreeReduce && (Cost - UserCost) >= -SLPCostThreshold) {
+    TEVectorizableSet Vec;
+    if (!findSubTree(Vec, UserCost))
+      return Cost;
 
+    if (!Tree->NoCallInst && Vec.size() > MaxCostsRecalculations) {
+      std::set<llvm::slpvectorizer::BoUpSLP::TreeEntry *>::iterator It =
+          Vec.begin();
+      std::advance(It, (unsigned)MaxCostsRecalculations);
+      Vec.erase(It, Vec.end());
+    }
+
+    for (TreeEntry *T : Vec) {
+      ProposedToGather.insert(T);
+      T->State = TreeEntry::NeedToGather;
+      for (Value *V : T->Scalars) {
+        Tree->MustGather.insert(V);
+        Tree->ExternalUses.erase(
+            llvm::remove_if(Tree->ExternalUses,
+                            [V](ExternalUser &EU) { return EU.Scalar == V; }),
+            Tree->ExternalUses.end());
+      }
+      CostSum -= T->Cost;
+      ExtractCost = getExtractCost(ProposedToGather);
+      if (!Tree->NoCallInst)
+        SpillCost = getSpillCost();
+      assert((!Tree->NoCallInst || getSpillCost() == 0) &&
+             "Incorrect spill cost");
+      InsertCost = getInsertCost(ProposedToGather);
+      Cost = CostSum + ExtractCost + SpillCost + InsertCost;
+      Tree->PartialCost = Cost - UserCost;
+      Tree->RemovedOperations.push_back(T);
+      if (Tree->PartialCost < -SLPCostThreshold) {
+        cutTree(ProposedToGather);
+        return Tree->PartialCost;
+      }
+    }
+  }
   return Cost;
 }
 
+
 InstructionCost
 BoUpSLP::getGatherCost(FixedVectorType *Ty,
                        const DenseSet<unsigned> &ShuffledIndices) const {
@@ -4208,9 +4545,9 @@
   // scheduled, and the last instruction is VL.back(). So we start with
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
-  if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
-        BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+  if (Tree->BlocksSchedules.count(BB)) {
+    auto *Bundle = Tree->BlocksSchedules[BB]->getScheduleData(
+        E->isOneOf(E->Scalars.back()));
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
         if (Bundle->OpValue == Bundle->Inst)
@@ -4275,7 +4612,7 @@
         FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
                                   find(Entry->ReuseShuffleIndices, FoundLane));
       }
-      ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
+      Tree->ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
     }
   }
 
@@ -4603,7 +4940,7 @@
         // to ExternalUses list to make sure that an extract will be generated
         // in the future.
         if (getTreeEntry(PO))
-          ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
+          Tree->ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
 
         NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
       } else {
@@ -4655,7 +4992,7 @@
       // ExternalUses to make sure that an extract will be generated in the
       // future.
       if (getTreeEntry(ScalarPtr))
-        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+        Tree->ExternalUses.emplace_back(ScalarPtr, cast<User>(VecPtr), 0);
 
       Value *V = propagateMetadata(ST, E->Scalars);
       if (NeedToShuffleReuses)
@@ -4757,7 +5094,7 @@
       // call to ExternalUses list to make sure that an extract will be
       // generated in the future.
       if (ScalarArg && getTreeEntry(ScalarArg))
-        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+        Tree->ExternalUses.emplace_back(ScalarArg, cast<User>(V), 0);
 
       propagateIRFlags(V, E->Scalars, VL0);
       if (NeedToShuffleReuses)
@@ -4849,28 +5186,39 @@
 Value *
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
-  for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+  for (auto &BSIter : Tree->BlocksSchedules) {
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all Schedule Data from all nodes that we have changed
+    // vectorization decision.
+    if (!Tree->RemovedOperations.empty())
+      removeFromScheduling(BS);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+  auto *VectorRoot = vectorizeTree(Tree->VectorizableTree.front().get());
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize && !Entry->VectorizedValue)
+      vectorizeTree(Entry);
+  }
 
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
-  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+  auto *ScalarRoot = Tree->VectorizableTree[0]->Scalars[0];
   if (MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast<Instruction>(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
-    auto BundleWidth = VectorizableTree[0]->Scalars.size();
+    Tree->BundleWidth = Tree->VectorizableTree[0]->Scalars.size();
     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-    auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
+    auto *VecTy = FixedVectorType::get(MinTy, Tree->BundleWidth);
     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
-    VectorizableTree[0]->VectorizedValue = Trunc;
+    Tree->VectorizableTree[0]->VectorizedValue = Trunc;
   }
 
-  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+  LLVM_DEBUG(dbgs() << "SLP: Extracting " << Tree->ExternalUses.size()
                     << " values .\n");
 
   // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
@@ -4884,7 +5232,7 @@
   };
 
   // Extract all of the elements with the external uses.
-  for (const auto &ExternalUse : ExternalUses) {
+  for (const auto &ExternalUse : Tree->ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
     llvm::User *User = ExternalUse.User;
 
@@ -4964,7 +5312,7 @@
   }
 
   // For each vectorized value:
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -4979,7 +5327,9 @@
 
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
-      if (!Ty->isVoidTy()) {
+      // The tree might not be fully vectorized, so we don't have to
+      // check every user.
+      if (!Ty->isVoidTy() && Tree->RemovedOperations.empty()) {
         for (User *U : Scalar->users()) {
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
 
@@ -4997,7 +5347,13 @@
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
-  return VectorizableTree[0]->VectorizedValue;
+  // Erase all saved trees after vectorization, except the current.
+  llvm::erase_if(
+      BuiltTrees,
+      [this](std::unique_ptr<TreeState> &T) { return T.get() != Tree; }),
+      BuiltTrees.end();
+
+  return Tree->VectorizableTree[0]->VectorizedValue;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -5466,6 +5822,31 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS) {
+  bool Removed = false;
+  for (TreeEntry *Entry : Tree->RemovedOperations) {
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (!Removed)
+    return;
+  BS->resetSchedule();
+  BS->initialFillReadyList(BS->ReadyInsts);
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+      continue;
+    BS->doForAllOpcodes(I,
+                        [](ScheduleData *SD) { SD->clearDependencies(); });
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -5680,11 +6061,11 @@
 void BoUpSLP::computeMinimumValueSizes() {
   // If there are no external uses, the expression tree must be rooted by a
   // store. We can't demote in-memory values, so there is nothing to do here.
-  if (ExternalUses.empty())
+  if (Tree->ExternalUses.empty())
     return;
 
   // We only attempt to truncate integer expressions.
-  auto &TreeRoot = VectorizableTree[0]->Scalars;
+  auto &TreeRoot = Tree->VectorizableTree[0]->Scalars;
   auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
   if (!TreeRootIT)
     return;
@@ -5696,7 +6077,7 @@
   // must have multiple uses and InstCombine will not rewrite it. The code
   // below ensures that only the roots are used externally.
   SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
-  for (auto &EU : ExternalUses)
+  for (auto &EU : Tree->ExternalUses)
     if (!Expr.erase(EU.Scalar))
       return;
   if (!Expr.empty())
@@ -5705,7 +6086,7 @@
   // Collect the scalar values of the vectorizable expression. We will use this
   // context to determine which values can be demoted. If we see a truncation,
   // we mark it as seeding another demotion.
-  for (auto &EntryPtr : VectorizableTree)
+  for (auto &EntryPtr : Tree->VectorizableTree)
     Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
 
   // Ensure the roots of the vectorizable tree don't form a cycle. They must
@@ -5951,6 +6332,12 @@
                         << " underlying objects.\n");
       Changed |= vectorizeGEPIndices(BB, R);
     }
+
+    // Partially vectorize trees after all full vectorization is done,
+    // otherwise, we could prevent more profitable full vectorization with
+    // smaller vector sizes.
+    if (SLPThrottling)
+      Changed |= R.tryPartialVectorization();
   }
 
   if (Changed) {
@@ -6008,6 +6395,8 @@
     R.vectorizeTree();
     return true;
   }
+  if (SLPThrottling && R.getTreeCost(true) < -SLPCostThreshold)
+    R.saveTree();
 
   return false;
 }
@@ -6252,6 +6641,7 @@
 
       R.computeMinimumValueSizes();
       InstructionCost Cost = R.getTreeCost();
+      InstructionCost UserCost = 0;
       CandidateFound = true;
       if (CompensateUseCost) {
         // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
@@ -6281,7 +6671,6 @@
         // Switching to the TTI interface might help a bit.
         // Alternative solution could be pattern-match to detect a no-op or
         // shuffle.
-        InstructionCost UserCost = 0;
         for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
           auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
           if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
@@ -6308,6 +6697,8 @@
         I += VF - 1;
         NextInst = I + 1;
         Changed = true;
+      } else if (SLPThrottling && R.getTreeCost(true, UserCost) < -SLPCostThreshold) {
+        R.saveTree();
       }
     }
   }
Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -204,11 +204,15 @@
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
 ; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
 ; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
+; MAX-COST-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
+; MAX-COST-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
+; MAX-COST-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
+; MAX-COST-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP3]], i32 1
+; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[P4]], i32 2
+; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[P6]], i32 3
+; MAX-COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
 ; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
 ; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
 ; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
@@ -220,19 +224,21 @@
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
 ; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP2]], i32 0
-; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
-; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
-; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
-; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
+; MAX-COST-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; MAX-COST-NEXT:    [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i32 0
+; MAX-COST-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
+; MAX-COST-NEXT:    [[TMP12:%.*]] = insertelement <4 x i1> [[TMP10]], i1 [[TMP11]], i32 1
+; MAX-COST-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
+; MAX-COST-NEXT:    [[TMP14:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP13]], i32 2
+; MAX-COST-NEXT:    [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP8]], i32 3
+; MAX-COST-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
-; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
-; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5
+; MAX-COST-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]])
+; MAX-COST-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], [[P27]]
+; MAX-COST-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], [[P29]]
+; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP19]], -5
 ; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
 ; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -116,12 +116,15 @@
 ; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
 ; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
 ; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[T15]]
 ;
   %t0 = bitcast <4 x float>* %x to i64*
Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -116,12 +116,15 @@
 ; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
 ; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
 ; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[T15]]
 ;
   %t0 = bitcast <4 x float>* %x to i64*
Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -60,35 +60,34 @@
 define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
 ; AVX1-LABEL: @powof2div_nonuniform(
 ; AVX1-NEXT:  entry:
-; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
-; AVX1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
-; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[ADD]], 2
-; AVX1-NEXT:    store i32 [[DIV]], i32* [[A:%.*]], align 4
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
-; AVX1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
-; AVX1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; AVX1-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
-; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
-; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; AVX1-NEXT:    store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
 ; AVX1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; AVX1-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
-; AVX1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
-; AVX1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
-; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
-; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; AVX1-NEXT:    store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; AVX1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; AVX1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; AVX1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
-; AVX1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
-; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
+; AVX1-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP5]], 2
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8
+; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX1-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16
 ; AVX1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; AVX1-NEXT:    store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i32 0
+; AVX1-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1
+; AVX1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2
+; AVX1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3
+; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @powof2div_nonuniform(
Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -175,43 +175,31 @@
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
 ; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
+; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
@@ -356,19 +344,12 @@
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
 ; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
@@ -378,22 +359,17 @@
 ; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
+; AVX-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -175,43 +175,31 @@
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
 ; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
+; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
@@ -356,19 +344,12 @@
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
 ; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
 ; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
 ; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
@@ -378,22 +359,17 @@
 ; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
-; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
-; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
+; AVX-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT:    store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -1,24 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-throttle=false < %s | FileCheck %s --check-prefixes=NO_THROTTLE
 
 define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 {
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP3]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
 ; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    unreachable
 ;
+; NO_THROTTLE-LABEL: @rftbsub(
+; NO_THROTTLE-NEXT:  entry:
+; NO_THROTTLE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
+; NO_THROTTLE-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; NO_THROTTLE-NEXT:    [[TMP1:%.*]] = or i64 2, 1
+; NO_THROTTLE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
+; NO_THROTTLE-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
+; NO_THROTTLE-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; NO_THROTTLE-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
+; NO_THROTTLE-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
+; NO_THROTTLE-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
+; NO_THROTTLE-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
+; NO_THROTTLE-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
+; NO_THROTTLE-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
+; NO_THROTTLE-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; NO_THROTTLE-NEXT:    unreachable
+;
 entry:
   %arrayidx6 = getelementptr inbounds double, double* %a, i64 2
   %0 = load double, double* %arrayidx6, align 8