Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -122,6 +122,15 @@
 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
 
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable tree partial vectorize with throttling"));
+
+static cl::opt<unsigned>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(32), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));
+
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
     cl::desc(
@@ -565,6 +574,8 @@
       MinVecRegSize = MinVectorRegSizeOption;
     else
       MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+    BuiltTrees.push_back(std::make_unique<TreeState>());
+    Tree = BuiltTrees.back().get();
   }
 
   /// Vectorize the tree that starts with the elements in \p VL.
@@ -578,7 +589,48 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost() const;
+  int getSpillCost();
+
+  /// \returns the cost extracting vectorized elements.
+  int getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost();
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. SLP algorithm looks to operations to vectorize starting
+  /// from seed instructions on the bottom toward the end of chains of
+  /// dependencies to the top of SLP graph, it groups potentially vectorizable
+  /// operations in scalar form to bundles.
+  /// For example:
+  ///
+  ///   <bundle 1> scalar form
+  ///      |
+  ///   <bundle 2> scalar form  <bundle 3> scalar form
+  ///       \                    /
+  ///        <seed root bundle> scalar form
+  ///
+  /// Total cost is not profitable to vectorize, hence all operations are in
+  /// scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> scalar form
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part on the tree, while all tree vectorization
+  /// does not make sense.
+  /// More details: https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf
+  bool findSubTree(int UserCost = 0);
+
+  /// Get raw summary of all elements of the tree.
+  int getRawTreeCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
@@ -597,22 +649,13 @@
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
                  ArrayRef<Value *> UserIgnoreLst = None);
 
-  /// Clear the internal data structures that are created by 'buildTree'.
-  void deleteTree() {
-    VectorizableTree.clear();
-    ScalarToTreeEntry.clear();
-    MustGather.clear();
-    ExternalUses.clear();
-    NumOpsWantToKeepOrder.clear();
-    NumOpsWantToKeepOriginalOrder = 0;
-    for (auto &Iter : BlocksSchedules) {
-      BlockScheduling *BS = Iter.second.get();
-      BS->clear();
-    }
-    MinBWs.clear();
+  /// Save current tree for possible later vectorization.
+  void saveTree() {
+    BuiltTrees.push_back(std::make_unique<TreeState>());
+    Tree = BuiltTrees.back().get();
   }
 
-  unsigned getTreeSize() const { return VectorizableTree.size(); }
+  unsigned getTreeSize() const { return Tree->VectorizableTree.size(); }
 
   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
@@ -620,13 +663,13 @@
   /// \returns The best order of instructions for vectorization.
   Optional<ArrayRef<unsigned>> bestOrder() const {
     auto I = std::max_element(
-        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
-        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
-           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+        Tree->NumOpsWantToKeepOrder.begin(), Tree->NumOpsWantToKeepOrder.end(),
+        [](const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D1,
+           const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D2) {
           return D1.second < D2.second;
         });
-    if (I == NumOpsWantToKeepOrder.end() ||
-        I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+    if (I == Tree->NumOpsWantToKeepOrder.end() ||
+        I->getSecond() <= Tree->NumOpsWantToKeepOriginalOrder)
       return None;
 
     return makeArrayRef(I->getFirst());
@@ -683,6 +726,12 @@
   ///       may not be necessary.
   bool isLoadCombineCandidate() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree();
+
+  /// Try partially vectorize the tree via throttling.
+  bool tryPartialVectorization();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1465,7 +1514,7 @@
     Value *VectorizedValue = nullptr;
 
     /// Do we need to gather this sequence ?
-    enum EntryState { Vectorize, NeedToGather };
+    enum EntryState { Vectorize, NeedToGather, ProposedToGather };
     EntryState State;
 
     /// Does this sequence require some shuffling?
@@ -1474,6 +1523,9 @@
     /// Does this entry require reordering?
     ArrayRef<unsigned> ReorderIndices;
 
+    /// Cost of this tree entry.
+    int Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -1486,6 +1538,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// Use of this entry.
+    TinyPtrVector<TreeEntry *> UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -1616,6 +1671,9 @@
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
+      case ProposedToGather:
+        dbgs() << "ProposedToGather\n";
+        break;
       }
       dbgs() << "MainOp: ";
       if (MainOp)
@@ -1658,9 +1716,10 @@
                           ArrayRef<unsigned> ReuseShuffleIndices = None,
                           ArrayRef<unsigned> ReorderIndices = None) {
     bool Vectorized = (bool)Bundle;
-    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
-    TreeEntry *Last = VectorizableTree.back().get();
-    Last->Idx = VectorizableTree.size() - 1;
+    Tree->VectorizableTree.push_back(
+        std::make_unique<TreeEntry>(Tree->VectorizableTree));
+    TreeEntry *Last = Tree->VectorizableTree.back().get();
+    Last->Idx = Tree->VectorizableTree.size() - 1;
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
@@ -1670,7 +1729,7 @@
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
-        ScalarToTreeEntry[VL[i]] = Last;
+        Tree->ScalarToTreeEntry[VL[i]] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
       unsigned Lane = 0;
@@ -1683,52 +1742,46 @@
       assert((!Bundle.getValue() || Lane == VL.size()) &&
              "Bundle and VL out of sync");
     } else {
-      MustGather.insert(VL.begin(), VL.end());
+      Tree->MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      Tree->VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(
+          Last);
+    }
 
     return Last;
   }
 
-  /// -- Vectorization State --
-  /// Holds all of the tree entries.
-  TreeEntry::VecTreeTy VectorizableTree;
-
 #ifndef NDEBUG
   /// Debug printer.
   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
-    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
-      VectorizableTree[Id]->dump();
+    for (unsigned Id = 0, IdE = Tree->VectorizableTree.size(); Id != IdE;
+         ++Id) {
+      Tree->VectorizableTree[Id]->dump();
       dbgs() << "\n";
     }
   }
 #endif
 
   TreeEntry *getTreeEntry(Value *V) {
-    auto I = ScalarToTreeEntry.find(V);
-    if (I != ScalarToTreeEntry.end())
+    auto I = Tree->ScalarToTreeEntry.find(V);
+    if (I != Tree->ScalarToTreeEntry.end())
       return I->second;
     return nullptr;
   }
 
   const TreeEntry *getTreeEntry(Value *V) const {
-    auto I = ScalarToTreeEntry.find(V);
-    if (I != ScalarToTreeEntry.end())
+    auto I = Tree->ScalarToTreeEntry.find(V);
+    if (I != Tree->ScalarToTreeEntry.end())
       return I->second;
     return nullptr;
   }
 
-  /// Maps a specific scalar to its tree entry.
-  SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
-
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
-  /// A list of scalars that we found that we need to keep as scalars.
-  ValueSet MustGather;
-
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -1745,6 +1798,9 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1789,12 +1845,6 @@
   /// eventually when the BoUpSLP is destructed.
   DenseMap<Instruction *, bool> DeletedInstructions;
 
-  /// A list of values that need to extracted out of the tree.
-  /// This list holds pairs of (Internal Scalar : External User). External User
-  /// can be nullptr, it means that this Internal Scalar will be used later,
-  /// after vectorization.
-  UserList ExternalUses;
-
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -2192,8 +2242,8 @@
     int SchedulingRegionID = 1;
   };
 
-  /// Attaches the BlockScheduling structures to basic blocks.
-  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+  /// Remove operations from the list of proposed to schedule.
+  void removeFromScheduling(BlockScheduling *BS);
 
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
@@ -2227,13 +2277,98 @@
     }
   };
 
-  /// Contains orders of operations along with the number of bundles that have
-  /// operations in this order. It stores only those orders that require
-  /// reordering, if reordering is not required it is counted using \a
-  /// NumOpsWantToKeepOriginalOrder.
-  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
-  /// Number of bundles that do not require reordering.
-  unsigned NumOpsWantToKeepOriginalOrder = 0;
+  /// Tree state that created by 'buildTree'.
+  struct TreeState {
+    using TreeStateTy = SmallVector<std::unique_ptr<TreeState>, 2>;
+
+    /// -- Vectorization State --
+    /// Holds all of the tree entries.
+    TreeEntry::VecTreeTy VectorizableTree;
+
+    /// Maps a specific scalar to its tree entry.
+    SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
+
+    /// Tree entries that should not be vectorized due to throttling.
+    SmallVector<TreeEntry *, 2> RemovedOperations;
+
+    /// Contains orders of operations along with the number of bundles that have
+    /// operations in this order. It stores only those orders that require
+    /// reordering, if reordering is not required it is counted using \a
+    /// NumOpsWantToKeepOriginalOrder.
+    DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>
+        NumOpsWantToKeepOrder;
+    /// Number of bundles that do not require reordering.
+    unsigned NumOpsWantToKeepOriginalOrder = 0;
+
+    /// A list of scalars that we found that we need to keep as scalars.
+    ValueSet MustGather;
+
+    /// Raw cost of all elemts in the tree.
+    int RawTreeCost = 0;
+
+    /// Final cost of the tree.
+    int TreeCost = 0;
+
+    /// Partail cost of the tree.
+    int PartialCost = 0;
+
+    /// Indicate that no CallInst found in the tree and we don't need to
+    /// calculate spill cost.
+    bool NoCallInst = true;
+
+    /// True, if we have calucalte tree cost for the tree.
+    bool IsCostSumReady = false;
+
+    /// A list of values that need to extracted out of the tree.
+    /// This list holds pairs of (Internal Scalar : External User). External
+    /// User can be nullptr, it means that this Internal Scalar will be used
+    /// later, after vectorization.
+    UserList ExternalUses;
+
+    /// Current operations width to vectorize.
+    unsigned BundleWidth = 0;
+
+    /// Internal tree oprations proposed to be vectorized values use.
+    SmallDenseMap<Value *, UserList> InternalTreeUses;
+
+    /// Attaches the BlockScheduling structures to basic blocks.
+    MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+    /// A map of scalar integer values to the smallest bit width with which they
+    /// can legally be represented. The values map to (width, signed) pairs,
+    /// where "width" indicates the minimum bit width and "signed" is True if
+    /// the value must be signed-extended, rather than zero-extended, back to
+    /// its original width.
+    MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+
+    /// Clear the internal data structures that are created by 'buildTree'.
+    void deleteTree() {
+      VectorizableTree.clear();
+      ScalarToTreeEntry.clear();
+      MustGather.clear();
+      ExternalUses.clear();
+      InternalTreeUses.clear();
+      RemovedOperations.clear();
+      NumOpsWantToKeepOrder.clear();
+      NumOpsWantToKeepOriginalOrder = 0;
+      for (auto &Iter : BlocksSchedules) {
+        BlockScheduling *BS = Iter.second.get();
+        BS->clear();
+      }
+      MinBWs.clear();
+      NoCallInst = true;
+      RawTreeCost = 0;
+      TreeCost = 0;
+      PartialCost = 0;
+      IsCostSumReady = false;
+    }
+  };
+
+  // Previous trees that might be worth to vectorize.
+  TreeState::TreeStateTy BuiltTrees;
+
+  // Current tree that we consider.
+  TreeState *Tree = nullptr;
 
   // Analysis and block reference.
   Function *F;
@@ -2287,7 +2422,7 @@
   };
 
   static NodeRef getEntryNode(BoUpSLP &R) {
-    return R.VectorizableTree[0].get();
+    return R.Tree->VectorizableTree[0].get();
   }
 
   static ChildIteratorType child_begin(NodeRef N) {
@@ -2315,14 +2450,14 @@
   };
 
   static nodes_iterator nodes_begin(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.begin());
+    return nodes_iterator(R->Tree->VectorizableTree.begin());
   }
 
   static nodes_iterator nodes_end(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.end());
+    return nodes_iterator(R->Tree->VectorizableTree.end());
   }
 
-  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+  static unsigned size(BoUpSLP *R) { return R->Tree->VectorizableTree.size(); }
 };
 
 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
@@ -2340,7 +2475,7 @@
     for (auto V : Entry->Scalars) {
       OS << *V;
       if (std::any_of(
-              R->ExternalUses.begin(), R->ExternalUses.end(),
+              R->Tree->ExternalUses.begin(), R->Tree->ExternalUses.end(),
               [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
         OS << " <extract>";
       OS << "\n";
@@ -2392,14 +2527,14 @@
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
                         ArrayRef<Value *> UserIgnoreLst) {
-  deleteTree();
+  Tree->deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
 
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -2421,7 +2556,7 @@
       if (ExtI != ExternallyUsedValues.end()) {
         LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                           << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
+        Tree->ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
       }
       for (User *U : Scalar->users()) {
         LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -2441,6 +2576,7 @@
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                               << ".\n");
             assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+            Tree->InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
             continue;
           }
         }
@@ -2451,7 +2587,7 @@
 
         LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                           << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+        Tree->ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
       }
     }
   }
@@ -2535,7 +2671,7 @@
   // we need to gather the scalars.
   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
   for (Value *V : VL) {
-    if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+    if (Tree->MustGather.count(V) || is_contained(UserIgnoreList, V)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
       return;
@@ -2579,7 +2715,7 @@
     VL = UniqueValues;
   }
 
-  auto &BSRef = BlocksSchedules[BB];
+  auto &BSRef = Tree->BlocksSchedules[BB];
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
 
@@ -2644,14 +2780,14 @@
       bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
       if (Reuse) {
         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOriginalOrder;
+        ++Tree->NumOpsWantToKeepOriginalOrder;
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies);
         // This is a special case, as it does not gather, but at the same time
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
         Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
+        Tree->VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
       if (!CurrentOrder.empty()) {
@@ -2665,7 +2801,7 @@
         // Insert new order with initial value 0, if it does not exist,
         // otherwise return the iterator to the existing one.
         auto StoredCurrentOrderAndNum =
-            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            Tree->NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
         ++StoredCurrentOrderAndNum->getSecond();
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies,
@@ -2674,7 +2810,7 @@
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
         Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
+        Tree->VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
       LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -2739,14 +2875,15 @@
         if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
           if (CurrentOrder.empty()) {
             // Original loads are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
+            ++Tree->NumOpsWantToKeepOriginalOrder;
             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
                                          UserTreeIdx, ReuseShuffleIndicies);
             TE->setOperandsInOrder();
             LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
           } else {
             // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            auto I =
+                Tree->NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
             ++I->getSecond();
             TreeEntry *TE =
                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -3001,7 +3138,7 @@
         if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
           if (CurrentOrder.empty()) {
             // Original stores are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
+            ++Tree->NumOpsWantToKeepOriginalOrder;
             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
                                          UserTreeIdx, ReuseShuffleIndicies);
             TE->setOperandsInOrder();
@@ -3009,7 +3146,8 @@
             LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
           } else {
             // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            auto I =
+                Tree->NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
             ++(I->getSecond());
             TreeEntry *TE =
                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -3155,6 +3293,63 @@
   }
 }
 
+void BoUpSLP::cutTree() {
+  SmallVector<TreeEntry *, 4> VecNodes;
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::Vectorize)
+      continue;
+    // For all canceled operations we should consider the possibility of
+    // use by with non-canceled operations and for that, it requires
+    // to populate ExternalUser list with canceled elements.
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        TreeEntry *UserTE = getTreeEntry(U);
+        if (!UserTE || UserTE->State != TreeEntry::ProposedToGather)
+          continue;
+        // Ignore users in the user ignore list.
+        auto *UserInst = cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        Tree->ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  }
+  // Canceling unprofitable elements.
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::ProposedToGather)
+      continue;
+    Entry->State = TreeEntry::NeedToGather;
+    for (Value *V : Entry->Scalars) {
+      Tree->ScalarToTreeEntry.erase(V);
+#ifndef NDEBUG
+      LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                        << " out of proposed to vectorize.\n");
+#endif
+    }
+  }
+}
+
+bool BoUpSLP::tryPartialVectorization() {
+  if (BuiltTrees.size() == 1)
+    return false;
+  Tree = BuiltTrees.front().get();
+  vectorizeTree();
+  LLVM_DEBUG(dbgs() << "SLP: Decided to partially vectorize tree with cost: "
+                    << Tree->PartialCost << ".\n");
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3256,7 +3451,7 @@
 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
   return I->hasOneUse() ||
          std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
-           return ScalarToTreeEntry.count(U) > 0;
+           return Tree->ScalarToTreeEntry.count(U) > 0;
          });
 }
 
@@ -3313,7 +3508,7 @@
     ReuseShuffleCost =
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
   }
-  if (E->State == TreeEntry::NeedToGather) {
+  if (E->State != TreeEntry::Vectorize) {
     if (allConstant(VL))
       return 0;
     if (isSplat(VL)) {
@@ -3331,7 +3526,7 @@
           // instruction as dead and remove its cost from the final cost of the
           // vectorized tree.
           if (areAllUsersVectorized(cast<Instruction>(V)) &&
-              !ScalarToTreeEntry.count(V)) {
+              !Tree->ScalarToTreeEntry.count(V)) {
             auto *IO = cast<ConstantInt>(
                 cast<ExtractElementInst>(V)->getIndexOperand());
             Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -3666,25 +3861,26 @@
 
 bool BoUpSLP::isFullyVectorizableTinyTree() const {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
-                    << VectorizableTree.size() << " is fully vectorizable .\n");
+                    << Tree->VectorizableTree.size()
+                    << " is fully vectorizable .\n");
 
   // We only handle trees of heights 1 and 2.
-  if (VectorizableTree.size() == 1 &&
-      VectorizableTree[0]->State == TreeEntry::Vectorize)
+  if (Tree->VectorizableTree.size() == 1 &&
+      Tree->VectorizableTree[0]->State == TreeEntry::Vectorize)
     return true;
 
-  if (VectorizableTree.size() != 2)
+  if (Tree->VectorizableTree.size() != 2)
     return false;
 
   // Handle splat and all-constants stores.
-  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
-      (allConstant(VectorizableTree[1]->Scalars) ||
-       isSplat(VectorizableTree[1]->Scalars)))
+  if (Tree->VectorizableTree[0]->State == TreeEntry::Vectorize &&
+      (allConstant(Tree->VectorizableTree[1]->Scalars) ||
+       isSplat(Tree->VectorizableTree[1]->Scalars)))
     return true;
 
   // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
-      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+  if (Tree->VectorizableTree[0]->State != TreeEntry::Vectorize ||
+      Tree->VectorizableTree[1]->State != TreeEntry::Vectorize)
     return false;
 
   return true;
@@ -3726,16 +3922,16 @@
   if (RdxOpcode != Instruction::Or)
     return false;
 
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+  unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size();
+  Value *FirstReduced = Tree->VectorizableTree[0]->Scalars[0];
   return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
 }
 
 bool BoUpSLP::isLoadCombineCandidate() const {
   // Peek through a final sequence of stores and check if all operations are
   // likely to be load-combined.
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  for (Value *Scalar : VectorizableTree[0]->Scalars) {
+  unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size();
+  for (Value *Scalar : Tree->VectorizableTree[0]->Scalars) {
     Value *X;
     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
         !isLoadCombineCandidateImpl(X, NumElts, TTI))
@@ -3747,7 +3943,7 @@
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
-  if (VectorizableTree.size() >= MinTreeSize)
+  if (Tree->VectorizableTree.size() >= MinTreeSize)
     return false;
 
   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
@@ -3755,8 +3951,8 @@
   if (isFullyVectorizableTinyTree())
     return false;
 
-  assert(VectorizableTree.empty()
-             ? ExternalUses.empty()
+  assert(Tree->VectorizableTree.empty()
+             ? Tree->ExternalUses.empty()
              : true && "We shouldn't have any external users");
 
   // Otherwise, we can't vectorize the tree. It is both tiny and not fully
@@ -3764,12 +3960,11 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() const {
+int BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   int Cost = 0;
 
   SmallPtrSet<Instruction*, 4> LiveValues;
@@ -3782,7 +3977,7 @@
   // their order does not matter, as long as all instructions in a basic block
   // are grouped together. Using dominance ensures a deterministic order.
   SmallVector<Instruction *, 16> OrderedScalars;
-  for (const auto &TEPtr : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
     if (!Inst)
       continue;
@@ -3801,7 +3996,8 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+      if (isa<Instruction>(&*J) &&
+          Tree->ScalarToTreeEntry.find(&*J) != Tree->ScalarToTreeEntry.end())
         LiveValues.insert(cast<Instruction>(&*J));
     }
 
@@ -3834,9 +4030,10 @@
     }
 
     if (NumCalls) {
+      Tree->NoCallInst = false;
       SmallVector<Type*, 4> V;
       for (auto *II : LiveValues)
-        V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
+        V.push_back(FixedVectorType::get(II->getType(), Tree->BundleWidth));
       Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
     }
 
@@ -3846,15 +4043,136 @@
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const {
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), Tree->BundleWidth);
+  Value *ScalarRoot = Tree->VectorizableTree.front()->Scalars[0];
+
+  auto It = MinBWs.find(ScalarRoot);
+  if (It != MinBWs.end()) {
+    uint64_t Width = It->second.first;
+    bool Signed = It->second.second;
+    auto *MinTy = IntegerType::get(F->getContext(), Width);
+    unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt;
+    VecTy = FixedVectorType::get(MinTy, Tree->BundleWidth);
+    return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy,
+                                          EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
 
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+int BoUpSLP::getExtractCost() const {
+  int ExtractCost = 0;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
+  // Consider the possibility of extracting vectorized
+  // values for canceled elements use.
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::ProposedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      // Consider the possibility of extracting vectorized
+      // values for canceled elements use.
+      auto It = Tree->InternalTreeUses.find(V);
+      if (It != Tree->InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+    }
+  }
+  for (const ExternalUser &EU : Tree->ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    int Cost = getExtractOperationCost(EU);
+    ExtractCost += Cost;
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost() {
+  int InsertCost = 0;
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    // Avoid already vectorized TreeEntries, it is already in a vector form and
+    // we don't need to gather those operations.
+    if (Entry->State != TreeEntry::ProposedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      if (llvm::any_of(Inst->users(), [this](User *Op) {
+            return Tree->ScalarToTreeEntry.count(Op) > 0;
+          })) {
+        InsertCost += getEntryCost(Entry);
+        break;
+      }
+    }
+  }
+  return InsertCost;
+}
+
+bool BoUpSLP::findSubTree(int UserCost) {
+  auto Cmp = [](const TreeEntry *LHS, const TreeEntry *RHS) {
+    return LHS->Cost > RHS->Cost;
+  };
+  std::set<TreeEntry *, decltype(Cmp)> Vec(Cmp);
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::Vectorize || Entry->Cost <= 0 || !Entry->Idx)
+      continue;
+    Vec.insert(Entry);
+  }
+  if (Vec.size() > MaxCostsRecalculations) {
+    std::set<llvm::slpvectorizer::BoUpSLP::TreeEntry *>::iterator It =
+        Vec.begin();
+    std::advance(It, (unsigned)MaxCostsRecalculations);
+    Vec.erase(It, Vec.end());
+  }
+
+  int Sum = 0;
+  for (TreeEntry *Entry : Vec)
+    Sum += Entry->Cost;
+  // Avoid reducing the tree if there is no potential room to reduce.
+  if ((Tree->TreeCost - UserCost - Sum) >= -SLPCostThreshold)
+    return false;
+
+  for (TreeEntry *T : Vec) {
+    T->State = TreeEntry::ProposedToGather;
+    for (Value *V : T->Scalars) {
+      Tree->MustGather.insert(V);
+      Tree->ExternalUses.erase(
+          llvm::remove_if(Tree->ExternalUses,
+                          [V](ExternalUser &EU) { return EU.Scalar == V; }),
+          Tree->ExternalUses.end());
+    }
+    Tree->PartialCost = getTreeCost() - UserCost;
+    Tree->RemovedOperations.push_back(T);
+    if (Tree->PartialCost < -SLPCostThreshold) {
+      cutTree();
+      return true;
+    }
+  }
+  return false;
+}
+
+int BoUpSLP::getRawTreeCost() {
+  int CostSum = 0;
+  Tree->BundleWidth = Tree->VectorizableTree.front()->Scalars.size();
+  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+                    << Tree->VectorizableTree.size() << ".\n");
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();
 
     // We create duplicate tree entries for gather sequences that have multiple
     // uses. However, we should not compute the cost of duplicate sequences.
@@ -3868,64 +4186,69 @@
     // their uses. Since such an approach results in fewer total entries,
     // existing heuristics based on tree size may yield different results.
     //
-    if (TE.State == TreeEntry::NeedToGather &&
-        std::any_of(std::next(VectorizableTree.begin(), I + 1),
-                    VectorizableTree.end(),
-                    [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
-                      return EntryPtr->State == TreeEntry::NeedToGather &&
-                             EntryPtr->isSame(TE.Scalars);
-                    }))
+    if (TE.State != TreeEntry::Vectorize &&
+        llvm::any_of(llvm::drop_begin(Tree->VectorizableTree, TE.Idx + 1),
+                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                       return EntryPtr->State != TreeEntry::Vectorize &&
+                              EntryPtr->isSame(TE.Scalars);
+                     }))
       continue;
 
-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
+    CostSum += TE.Cost;
   }
 
-  SmallPtrSet<Value *, 16> ExtractCostCalculated;
-  int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  if (SLPThrottling)
+    for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State != TreeEntry::Vectorize)
+        continue;
+      int GatherCost = 0;
+      for (TreeEntry *Gather : TE->UseEntries)
+        if (Gather->State != TreeEntry::Vectorize)
+          GatherCost += Gather->Cost;
+      TE->Cost += GatherCost;
+    }
+  return CostSum;
+}
 
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+int BoUpSLP::getTreeCost() {
+  int CostSum;
+  if (!Tree->IsCostSumReady) {
+    CostSum = getRawTreeCost();
+    Tree->RawTreeCost = CostSum;
+  } else {
+    CostSum = Tree->RawTreeCost;
+  }
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = FixedVectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+  if (SLPThrottling)
+    for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State == TreeEntry::ProposedToGather)
+        CostSum -= TE->Cost;
     }
-  }
 
-  int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  int ExtractCost = getExtractCost();
+  int SpillCost = 0;
+  if (!Tree->NoCallInst || !Tree->IsCostSumReady)
+    SpillCost = getSpillCost();
+  assert((!Tree->NoCallInst || getSpillCost() == 0) && "Incorrect spill cost");
+  if (!Tree->IsCostSumReady)
+    Tree->IsCostSumReady = true;
+  int InsertCost = getInsertCost();
+  int Cost = CostSum + ExtractCost + SpillCost + InsertCost;
+  Tree->TreeCost = Cost;
 
 #ifndef NDEBUG
   SmallString<256> Str;
-  {
-    raw_svector_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
-  }
+  raw_svector_ostream OS(Str);
+  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+     << "SLP: Extract Cost = " << ExtractCost << ".\n"
+     << "SLP: Insert Cost = " << InsertCost << ".\n"
+     << "SLP: Total Cost = " << Cost << ".\n";
   LLVM_DEBUG(dbgs() << Str);
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
@@ -4003,9 +4326,9 @@
   // scheduled, and the last instruction is VL.back(). So we start with
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
-  if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
-        BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+  if (Tree->BlocksSchedules.count(BB)) {
+    auto *Bundle = Tree->BlocksSchedules[BB]->getScheduleData(
+        E->isOneOf(E->Scalars.back()));
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
         if (Bundle->OpValue == Bundle->Inst)
@@ -4073,7 +4396,7 @@
               std::distance(E->ReuseShuffleIndices.begin(),
                             llvm::find(E->ReuseShuffleIndices, FoundLane));
         }
-        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+        Tree->ExternalUses.emplace_back(VL[i], Insrt, FoundLane);
       }
     }
   }
@@ -4428,7 +4751,7 @@
       // future.
       Value *PO = LI->getPointerOperand();
       if (getTreeEntry(PO))
-        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+        Tree->ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
 
       LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
       Value *V = propagateMetadata(LI, E->Scalars);
@@ -4473,7 +4796,7 @@
       // ExternalUses to make sure that an extract will be generated in the
       // future.
       if (getTreeEntry(ScalarPtr))
-        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+        Tree->ExternalUses.emplace_back(ScalarPtr, cast<User>(VecPtr), 0);
 
       Value *V = propagateMetadata(ST, E->Scalars);
       if (NeedToShuffleReuses) {
@@ -4577,7 +4900,7 @@
       // call to ExternalUses list to make sure that an extract will be
       // generated in the future.
       if (ScalarArg && getTreeEntry(ScalarArg))
-        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+        Tree->ExternalUses.emplace_back(ScalarArg, cast<User>(V), 0);
 
       propagateIRFlags(V, E->Scalars, VL0);
       if (NeedToShuffleReuses) {
@@ -4671,28 +4994,39 @@
 Value *
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
-  for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+  for (auto &BSIter : Tree->BlocksSchedules) {
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all Schedule Data from all nodes that we have changed
+    // vectorization decision.
+    if (!Tree->RemovedOperations.empty())
+      removeFromScheduling(BS);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+  auto *VectorRoot = vectorizeTree(Tree->VectorizableTree.front().get());
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize && !Entry->VectorizedValue)
+      vectorizeTree(Entry);
+  }
 
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
-  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+  auto *ScalarRoot = Tree->VectorizableTree[0]->Scalars[0];
   if (MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast<Instruction>(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
-    auto BundleWidth = VectorizableTree[0]->Scalars.size();
+    Tree->BundleWidth = Tree->VectorizableTree[0]->Scalars.size();
     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-    auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
+    auto *VecTy = FixedVectorType::get(MinTy, Tree->BundleWidth);
     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
-    VectorizableTree[0]->VectorizedValue = Trunc;
+    Tree->VectorizableTree[0]->VectorizedValue = Trunc;
   }
 
-  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+  LLVM_DEBUG(dbgs() << "SLP: Extracting " << Tree->ExternalUses.size()
                     << " values .\n");
 
   // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
@@ -4706,7 +5040,7 @@
   };
 
   // Extract all of the elements with the external uses.
-  for (const auto &ExternalUse : ExternalUses) {
+  for (const auto &ExternalUse : Tree->ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
     llvm::User *User = ExternalUse.User;
 
@@ -4785,7 +5119,7 @@
   }
 
   // For each vectorized value:
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -4800,7 +5134,9 @@
 
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
-      if (!Ty->isVoidTy()) {
+      // The tree might not be fully vectorized, so we don't have to
+      // check every user.
+      if (!Ty->isVoidTy() && Tree->RemovedOperations.empty()) {
         for (User *U : Scalar->users()) {
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
 
@@ -4818,7 +5154,13 @@
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
-  return VectorizableTree[0]->VectorizedValue;
+  // Erase all saved trees after vectorization, except the current.
+  llvm::erase_if(
+      BuiltTrees,
+      [this](std::unique_ptr<TreeState> &T) { return T.get() != Tree; }),
+      BuiltTrees.end();
+
+  return Tree->VectorizableTree[0]->VectorizedValue;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -5286,6 +5628,31 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS) {
+  bool Removed = false;
+  for (TreeEntry *Entry : Tree->RemovedOperations) {
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (!Removed)
+    return;
+  BS->resetSchedule();
+  BS->initialFillReadyList(BS->ReadyInsts);
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+      continue;
+    BS->doForAllOpcodes(I,
+                        [](ScheduleData *SD) { SD->clearDependencies(); });
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -5495,11 +5862,11 @@
 void BoUpSLP::computeMinimumValueSizes() {
   // If there are no external uses, the expression tree must be rooted by a
   // store. We can't demote in-memory values, so there is nothing to do here.
-  if (ExternalUses.empty())
+  if (Tree->ExternalUses.empty())
     return;
 
   // We only attempt to truncate integer expressions.
-  auto &TreeRoot = VectorizableTree[0]->Scalars;
+  auto &TreeRoot = Tree->VectorizableTree[0]->Scalars;
   auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
   if (!TreeRootIT)
     return;
@@ -5511,7 +5878,7 @@
   // must have multiple uses and InstCombine will not rewrite it. The code
   // below ensures that only the roots are used externally.
   SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
-  for (auto &EU : ExternalUses)
+  for (auto &EU : Tree->ExternalUses)
     if (!Expr.erase(EU.Scalar))
       return;
   if (!Expr.empty())
@@ -5520,7 +5887,7 @@
   // Collect the scalar values of the vectorizable expression. We will use this
   // context to determine which values can be demoted. If we see a truncation,
   // we mark it as seeding another demotion.
-  for (auto &EntryPtr : VectorizableTree)
+  for (auto &EntryPtr : Tree->VectorizableTree)
     Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
 
   // Ensure the roots of the vectorizable tree don't form a cycle. They must
@@ -5766,6 +6133,12 @@
                         << " underlying objects.\n");
       Changed |= vectorizeGEPIndices(BB, R);
     }
+
+    // Partially vectorize trees after all full vectorization is done,
+    // otherwise, we could prevent more profitable full vectorization with
+    // smaller vector sizes.
+    if (SLPThrottling)
+      Changed |= R.tryPartialVectorization();
   }
 
   if (Changed) {
@@ -5823,6 +6196,8 @@
     R.vectorizeTree();
     return true;
   }
+  if (SLPThrottling && R.findSubTree())
+    R.saveTree();
 
   return false;
 }
@@ -6066,6 +6441,7 @@
 
       R.computeMinimumValueSizes();
       int Cost = R.getTreeCost();
+      unsigned UserCost = 0;
       CandidateFound = true;
       if (CompensateUseCost) {
         // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
@@ -6095,7 +6471,6 @@
         // Switching to the TTI interface might help a bit.
         // Alternative solution could be pattern-match to detect a no-op or
         // shuffle.
-        unsigned UserCost = 0;
         for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
           auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
           if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
@@ -6122,6 +6497,8 @@
         I += VF - 1;
         NextInst = I + 1;
         Changed = true;
+      } else if (SLPThrottling && R.findSubTree(UserCost)) {
+        R.saveTree();
       }
     }
   }
@@ -6905,16 +7282,17 @@
       int TreeCost = V.getTreeCost();
       int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
       int Cost = TreeCost + ReductionCost;
-      if (Cost >= -SLPCostThreshold) {
-          V.getORE()->emit([&]() {
-              return OptimizationRemarkMissed(
-                         SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
+      if (Cost >= -SLPCostThreshold &&
+          (!SLPThrottling || !V.findSubTree(-ReductionCost))) {
+        V.getORE()->emit([&]() {
+          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+                                          cast<Instruction>(VL[0]))
                      << "Vectorizing horizontal reduction is possible"
                      << "but not beneficial with cost "
                      << ore::NV("Cost", Cost) << " and threshold "
                      << ore::NV("Threshold", -SLPCostThreshold);
-          });
-          break;
+        });
+        break;
       }
 
       LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -204,11 +204,15 @@
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
 ; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
 ; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
+; MAX-COST-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
+; MAX-COST-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> undef, i8 [[TMP1]], i32 0
+; MAX-COST-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
+; MAX-COST-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP3]], i32 1
+; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[P4]], i32 2
+; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[P6]], i32 3
+; MAX-COST-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
 ; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
 ; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
 ; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
@@ -220,19 +224,21 @@
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
 ; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0
-; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
-; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
-; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
-; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
+; MAX-COST-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; MAX-COST-NEXT:    [[TMP10:%.*]] = insertelement <4 x i1> undef, i1 [[TMP9]], i32 0
+; MAX-COST-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
+; MAX-COST-NEXT:    [[TMP12:%.*]] = insertelement <4 x i1> [[TMP10]], i1 [[TMP11]], i32 1
+; MAX-COST-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
+; MAX-COST-NEXT:    [[TMP14:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP13]], i32 2
+; MAX-COST-NEXT:    [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP8]], i32 3
+; MAX-COST-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
-; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
-; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5
+; MAX-COST-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP16]])
+; MAX-COST-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], [[P27]]
+; MAX-COST-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], [[P29]]
+; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP19]], -5
 ; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
 ; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -116,12 +116,15 @@
 ; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
 ; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
 ; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[T15]]
 ;
   %t0 = bitcast <4 x float>* %x to i64*
Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -60,35 +60,34 @@
 define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
 ; AVX1-LABEL: @powof2div_nonuniform(
 ; AVX1-NEXT:  entry:
-; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
-; AVX1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
-; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[ADD]], 2
-; AVX1-NEXT:    store i32 [[DIV]], i32* [[A:%.*]], align 4
-; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
-; AVX1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
-; AVX1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; AVX1-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
-; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
-; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; AVX1-NEXT:    store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; AVX1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
 ; AVX1-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; AVX1-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
-; AVX1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
-; AVX1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
-; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
-; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; AVX1-NEXT:    store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; AVX1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; AVX1-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; AVX1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; AVX1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
-; AVX1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
-; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
+; AVX1-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; AVX1-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP5]], 2
+; AVX1-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; AVX1-NEXT:    [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4
+; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; AVX1-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX1-NEXT:    [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8
+; AVX1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX1-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX1-NEXT:    [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16
 ; AVX1-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; AVX1-NEXT:    store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[DIV]], i32 0
+; AVX1-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1
+; AVX1-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2
+; AVX1-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3
+; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @powof2div_nonuniform(
Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -1,24 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-throttle=false < %s | FileCheck %s --check-prefixes=NO_THROTTLE
 
 define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 {
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP3]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
 ; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    unreachable
 ;
+; NO_THROTTLE-LABEL: @rftbsub(
+; NO_THROTTLE-NEXT:  entry:
+; NO_THROTTLE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
+; NO_THROTTLE-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; NO_THROTTLE-NEXT:    [[TMP1:%.*]] = or i64 2, 1
+; NO_THROTTLE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
+; NO_THROTTLE-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
+; NO_THROTTLE-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; NO_THROTTLE-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
+; NO_THROTTLE-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
+; NO_THROTTLE-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
+; NO_THROTTLE-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
+; NO_THROTTLE-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
+; NO_THROTTLE-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
+; NO_THROTTLE-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; NO_THROTTLE-NEXT:    unreachable
+;
 entry:
   %arrayidx6 = getelementptr inbounds double, double* %a, i64 2
   %0 = load double, double* %arrayidx6, align 8