Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator.h"
@@ -117,8 +118,17 @@
                               "number "));
 
 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
-                   cl::desc("Attempt to vectorize horizontal reductions"));
+    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+                       cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable tree partial vectorize with throttling"));
+
+static cl::opt<int>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(128), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));
 
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -557,6 +567,8 @@
       MinVecRegSize = MinVectorRegSizeOption;
     else
       MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+    BuiltTrees.push_back(std::make_unique<TreeState>());
+    Tree = BuiltTrees.back().get();
   }
 
   /// Vectorize the tree that starts with the elements in \p VL.
@@ -570,7 +582,55 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost() const;
+  int getSpillCost();
+
+  /// \returns the cost of extracting vectorized elements.
+  int getExtractCost();
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost() const;
+
+  /// Cut given path until it might be good to vectorize.
+  bool cutPath(int &Cost, SetVector<TreeEntry *> &Path);
+
+  /// Find a non-gathering leaf node from current node C and record the path
+  /// on the way.
+  void findLeaf(TreeEntry *C, SetVector<TreeEntry *> &Path) const;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize,
+  /// starting from the seed instructions at the bottom and following the
+  /// chains of dependencies up to the top of the SLP graph. It groups
+  /// potentially vectorizable scalar operations into bundles.
+  /// For example:
+  ///
+  ///   <bundle 1> scalar form
+  ///      |
+  ///   <bundle 2> scalar form  <bundle 3> scalar form
+  ///       \                    /
+  ///        <seed root bundle> scalar form
+  ///
+  /// Total cost is not profitable to vectorize, hence all operations are in
+  /// scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> scalar form
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, while vectorizing the whole
+  /// tree does not make sense.
+  /// More details: http://www.llvm.org/devmtg/2015-10/slides/Porpodas-ThrottlingAutomaticVectorization.pdf
+  bool findSubTree();
+
+  /// Get the raw cost of all elements of the tree.
+  int getRawTreeCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
@@ -589,22 +649,13 @@
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
                  ArrayRef<Value *> UserIgnoreLst = None);
 
-  /// Clear the internal data structures that are created by 'buildTree'.
-  void deleteTree() {
-    VectorizableTree.clear();
-    ScalarToTreeEntry.clear();
-    MustGather.clear();
-    ExternalUses.clear();
-    NumOpsWantToKeepOrder.clear();
-    NumOpsWantToKeepOriginalOrder = 0;
-    for (auto &Iter : BlocksSchedules) {
-      BlockScheduling *BS = Iter.second.get();
-      BS->clear();
-    }
-    MinBWs.clear();
+  /// Leave the current tree in BuiltTrees and make a new empty one current.
+  void saveTree() {
+    BuiltTrees.push_back(std::make_unique<TreeState>());
+    Tree = BuiltTrees.back().get();
   }
 
-  unsigned getTreeSize() const { return VectorizableTree.size(); }
+  unsigned getTreeSize() const { return Tree->VectorizableTree.size(); }
 
   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
@@ -612,13 +663,13 @@
   /// \returns The best order of instructions for vectorization.
   Optional<ArrayRef<unsigned>> bestOrder() const {
     auto I = std::max_element(
-        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
-        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
-           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+        Tree->NumOpsWantToKeepOrder.begin(), Tree->NumOpsWantToKeepOrder.end(),
+        [](const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D1,
+           const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D2) {
           return D1.second < D2.second;
         });
-    if (I == NumOpsWantToKeepOrder.end() ||
-        I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+    if (I == Tree->NumOpsWantToKeepOrder.end() ||
+        I->getSecond() <= Tree->NumOpsWantToKeepOriginalOrder)
       return None;
 
     return makeArrayRef(I->getFirst());
@@ -657,6 +708,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;
 
+  /// Assess the subtree by the kind of its operations, not only by cost.
+  bool isGoodSubTreeToVectorize() const;
+
   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
   /// can be load combined in the backend. Load combining may not be allowed in
   /// the IR optimizer, so we do not want to alter the pattern. For example,
@@ -666,6 +720,12 @@
   ///       may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
 
+  /// Try to cut the tree to make it partially vectorizable.
+  bool cutTree();
+
+  /// Try to partially vectorize the tree via throttling.
+  bool tryPartialVectorization();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1447,7 +1507,7 @@
     Value *VectorizedValue = nullptr;
 
     /// Do we need to gather this sequence ?
-    enum EntryState { Vectorize, NeedToGather };
+    enum EntryState { Vectorize, NeedToGather, ProposedToGather };
     EntryState State;
 
     /// Does this sequence require some shuffling?
@@ -1456,6 +1516,12 @@
     /// Does this entry require reordering?
     ArrayRef<unsigned> ReorderIndices;
 
+    /// Cost of this tree entry.
+    int Cost = 0;
+
+    /// Extract cost for this entry.
+    int ExtractCost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -1468,6 +1534,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// Entries that were created with this entry as their user.
+    TinyPtrVector<TreeEntry *> UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -1578,6 +1647,13 @@
       return true;
     }
 
+    // True if UseEntries holds more than one other entry in Vectorize state.
+    bool isBranch() const {
+      return llvm::count_if(UseEntries, [this](TreeEntry *Next) {
+               return (Next->Idx != Idx && Next->State == TreeEntry::Vectorize);
+             }) > 1;
+    }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -1598,6 +1674,9 @@
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
+      case ProposedToGather:
+        dbgs() << "ProposedToGather\n";
+        break;
       }
       dbgs() << "MainOp: ";
       if (MainOp)
@@ -1640,19 +1719,20 @@
                           ArrayRef<unsigned> ReuseShuffleIndices = None,
                           ArrayRef<unsigned> ReorderIndices = None) {
     bool Vectorized = (bool)Bundle;
-    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
-    TreeEntry *Last = VectorizableTree.back().get();
-    Last->Idx = VectorizableTree.size() - 1;
+    Tree->VectorizableTree.push_back(std::make_unique<TreeEntry>(Tree->VectorizableTree));
+    TreeEntry *Last = Tree->VectorizableTree.back().get();
+    Last->Idx = Tree->VectorizableTree.size() - 1;
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
     Last->setOperations(S);
+    Last->ExtractCost = 0;
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
-        ScalarToTreeEntry[VL[i]] = Last;
+        Tree->ScalarToTreeEntry[VL[i]] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
       unsigned Lane = 0;
@@ -1665,49 +1745,41 @@
       assert((!Bundle.getValue() || Lane == VL.size()) &&
              "Bundle and VL out of sync");
     } else {
-      MustGather.insert(VL.begin(), VL.end());
+      Tree->MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      Tree->VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last);
+    }
 
     return Last;
   }
 
-  /// -- Vectorization State --
-  /// Holds all of the tree entries.
-  TreeEntry::VecTreeTy VectorizableTree;
-
 #ifndef NDEBUG
   /// Debug printer.
   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
-    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
-      VectorizableTree[Id]->dump();
+    for (unsigned Id = 0, IdE = Tree->VectorizableTree.size(); Id != IdE; ++Id) {
+      Tree->VectorizableTree[Id]->dump();
       dbgs() << "\n";
     }
   }
 #endif
 
   TreeEntry *getTreeEntry(Value *V) {
-    auto I = ScalarToTreeEntry.find(V);
-    if (I != ScalarToTreeEntry.end())
+    auto I = Tree->ScalarToTreeEntry.find(V);
+    if (I != Tree->ScalarToTreeEntry.end())
       return I->second;
     return nullptr;
   }
 
   const TreeEntry *getTreeEntry(Value *V) const {
-    auto I = ScalarToTreeEntry.find(V);
-    if (I != ScalarToTreeEntry.end())
+    auto I = Tree->ScalarToTreeEntry.find(V);
+    if (I != Tree->ScalarToTreeEntry.end())
       return I->second;
     return nullptr;
   }
 
-  /// Maps a specific scalar to its tree entry.
-  SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
-
-  /// A list of scalars that we found that we need to keep as scalars.
-  ValueSet MustGather;
-
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -1724,6 +1796,9 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1768,12 +1843,6 @@
   /// eventually when the BoUpSLP is destructed.
   DenseMap<Instruction *, bool> DeletedInstructions;
 
-  /// A list of values that need to extracted out of the tree.
-  /// This list holds pairs of (Internal Scalar : External User). External User
-  /// can be nullptr, it means that this Internal Scalar will be used later,
-  /// after vectorization.
-  UserList ExternalUses;
-
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -2171,8 +2240,8 @@
     int SchedulingRegionID = 1;
   };
 
-  /// Attaches the BlockScheduling structures to basic blocks.
-  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+  /// Remove operations from the list of those proposed for scheduling.
+  void removeFromScheduling(BlockScheduling *BS);
 
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
@@ -2206,13 +2275,112 @@
     }
   };
 
-  /// Contains orders of operations along with the number of bundles that have
-  /// operations in this order. It stores only those orders that require
-  /// reordering, if reordering is not required it is counted using \a
-  /// NumOpsWantToKeepOriginalOrder.
-  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
-  /// Number of bundles that do not require reordering.
-  unsigned NumOpsWantToKeepOriginalOrder = 0;
+  /// Tree state that is created by 'buildTree'.
+  struct TreeState {
+    using TreeStateTy = SmallVector<std::unique_ptr<TreeState>, 8>;
+
+    /// -- Vectorization State --
+    /// Holds all of the tree entries.
+    TreeEntry::VecTreeTy VectorizableTree;
+
+    /// Maps a specific scalar to its tree entry.
+    SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
+
+    /// A list of scalars that we found that we need to keep as scalars.
+    ValueSet MustGather;
+
+    /// A list of values that need to be extracted out of the tree.
+    /// This list holds pairs of (Internal Scalar : External User). External
+    /// User can be nullptr, it means that this Internal Scalar will be used
+    /// later, after vectorization.
+    UserList ExternalUses;
+
+    /// Maps an in-tree user to the list of tree scalars it uses.
+    SmallDenseMap<Value *, UserList> InternalTreeUses;
+
+    /// Current operations width to vectorize.
+    unsigned BundleWidth = 0;
+
+    /// Tree entries that should not be vectorized due to throttling.
+    SmallVector<TreeEntry *, 2> RemovedOperations;
+
+    /// Contains orders of operations along with the number of bundles that have
+    /// operations in this order. It stores only those orders that require
+    /// reordering, if reordering is not required it is counted using \a
+    /// NumOpsWantToKeepOriginalOrder.
+    DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>
+        NumOpsWantToKeepOrder;
+    /// Number of bundles that do not require reordering.
+    unsigned NumOpsWantToKeepOriginalOrder = 0;
+
+    /// Attaches the BlockScheduling structures to basic blocks.
+    MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+    /// A map of scalar integer values to the smallest bit width with which they
+    /// can legally be represented. The values map to (width, signed) pairs,
+    /// where "width" indicates the minimum bit width and "signed" is True if
+    /// the value must be signed-extended, rather than zero-extended, back to
+    /// its original width.
+    MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+
+    /// Tree values proposed to be vectorized.
+    ValueSet ScalarsToVec;
+
+    /// Tree values once considered to be vectorized, but later with throttling
+    /// decided to stay in a scalar form.
+    ValueSet VecToScalars;
+
+    /// Total cost of inserts in the tree for a particular value.
+    SmallDenseMap<Value *, int> VecInserts;
+
+    /// Number of times (counted in nodes) that the cost of the subtree has
+    /// already been recalculated during throttling.
+    int CostsRecalculations = 0;
+
+    /// Indicate that no CallInst found in the tree and we don't need to
+    /// calculate spill cost.
+    bool NoCallInst = true;
+
+    /// Raw cost of all elements in the tree.
+    int RawTreeCost = 0;
+
+    /// Total cost of tree including raw tree cost and extract, spill cost, etc.
+    int TotalCost = 0;
+
+    /// True if the cost of the tree has already been calculated.
+    bool IsCostSumReady = false;
+
+    /// Clear the data created by 'buildTree'; BundleWidth is not reset here.
+    void deleteTree() {
+      VectorizableTree.clear();
+      ScalarToTreeEntry.clear();
+      MustGather.clear();
+      ExternalUses.clear();
+      InternalTreeUses.clear();
+      RemovedOperations.clear();
+      NumOpsWantToKeepOrder.clear();
+      NumOpsWantToKeepOriginalOrder = 0;
+      for (auto &Iter : BlocksSchedules) {
+        BlockScheduling *BS = Iter.second.get();
+        BS->clear();
+      }
+      MinBWs.clear();
+      ScalarsToVec.clear();
+      VecToScalars.clear();
+      VecInserts.clear();
+      CostsRecalculations = 0;
+      NoCallInst = true;
+      TotalCost = 0;
+      RawTreeCost = 0;
+      IsCostSumReady = false;
+    }
+  };
+
+  // All built trees, including earlier ones that might be worth vectorizing.
+  TreeState::TreeStateTy BuiltTrees;
+
+  // Current tree that we consider.
+  TreeState *Tree = nullptr;
 
   // Analysis and block reference.
   Function *F;
@@ -2232,13 +2400,6 @@
 
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
-
-  /// A map of scalar integer values to the smallest bit width with which they
-  /// can legally be represented. The values map to (width, signed) pairs,
-  /// where "width" indicates the minimum bit width and "signed" is True if the
-  /// value must be signed-extended, rather than zero-extended, back to its
-  /// original width.
-  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
 };
 
 } // end namespace slpvectorizer
@@ -2266,7 +2427,7 @@
   };
 
   static NodeRef getEntryNode(BoUpSLP &R) {
-    return R.VectorizableTree[0].get();
+    return R.Tree->VectorizableTree[0].get();
   }
 
   static ChildIteratorType child_begin(NodeRef N) {
@@ -2294,14 +2455,14 @@
   };
 
   static nodes_iterator nodes_begin(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.begin());
+    return nodes_iterator(R->Tree->VectorizableTree.begin());
   }
 
   static nodes_iterator nodes_end(BoUpSLP *R) {
-    return nodes_iterator(R->VectorizableTree.end());
+    return nodes_iterator(R->Tree->VectorizableTree.end());
   }
 
-  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+  static unsigned size(BoUpSLP *R) { return R->Tree->VectorizableTree.size(); }
 };
 
 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
@@ -2319,7 +2480,7 @@
     for (auto V : Entry->Scalars) {
       OS << *V;
       if (std::any_of(
-              R->ExternalUses.begin(), R->ExternalUses.end(),
+              R->Tree->ExternalUses.begin(), R->Tree->ExternalUses.end(),
               [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
         OS << " <extract>";
       OS << "\n";
@@ -2370,14 +2531,14 @@
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
                         ArrayRef<Value *> UserIgnoreLst) {
-  deleteTree();
+  Tree->deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
 
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -2399,7 +2560,7 @@
       if (ExtI != ExternallyUsedValues.end()) {
         LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                           << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
+        Tree->ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
       }
       for (User *U : Scalar->users()) {
         LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -2419,6 +2580,7 @@
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                               << ".\n");
             assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+            Tree->InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
             continue;
           }
         }
@@ -2429,7 +2591,7 @@
 
         LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                           << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+        Tree->ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
       }
     }
   }
@@ -2513,7 +2675,7 @@
   // we need to gather the scalars.
   // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
   for (Value *V : VL) {
-    if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+    if (Tree->MustGather.count(V) || is_contained(UserIgnoreList, V)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
       return;
@@ -2557,7 +2719,7 @@
     VL = UniqueValues;
   }
 
-  auto &BSRef = BlocksSchedules[BB];
+  auto &BSRef = Tree->BlocksSchedules[BB];
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
 
@@ -2622,14 +2784,14 @@
       bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
       if (Reuse) {
         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOriginalOrder;
+        ++Tree->NumOpsWantToKeepOriginalOrder;
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies);
         // This is a special case, as it does not gather, but at the same time
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
         Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
+        Tree->VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
       if (!CurrentOrder.empty()) {
@@ -2643,7 +2805,7 @@
         // Insert new order with initial value 0, if it does not exist,
         // otherwise return the iterator to the existing one.
         auto StoredCurrentOrderAndNum =
-            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            Tree->NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
         ++StoredCurrentOrderAndNum->getSecond();
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies,
@@ -2652,7 +2814,7 @@
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
         Op0.assign(VL.size(), VL0->getOperand(0));
-        VectorizableTree.back()->setOperand(0, Op0);
+        Tree->VectorizableTree.back()->setOperand(0, Op0);
         return;
       }
       LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -2717,14 +2879,14 @@
         if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
           if (CurrentOrder.empty()) {
             // Original loads are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
+            ++Tree->NumOpsWantToKeepOriginalOrder;
             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
                                          UserTreeIdx, ReuseShuffleIndicies);
             TE->setOperandsInOrder();
             LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
           } else {
             // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            auto I = Tree->NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
             ++I->getSecond();
             TreeEntry *TE =
                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -2979,7 +3141,7 @@
         if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
           if (CurrentOrder.empty()) {
             // Original stores are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
+            ++Tree->NumOpsWantToKeepOriginalOrder;
             TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
                                          UserTreeIdx, ReuseShuffleIndicies);
             TE->setOperandsInOrder();
@@ -2987,7 +3149,7 @@
             LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
           } else {
             // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            auto I = Tree->NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
             ++(I->getSecond());
             TreeEntry *TE =
                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -3126,6 +3288,63 @@
   }
 }
 
+bool BoUpSLP::cutTree() {
+  SmallVector<TreeEntry *, 4> VecNodes;
+  if (!isGoodSubTreeToVectorize())
+    return false;
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize)
+      VecNodes.push_back(Entry);
+  }
+  if (VecNodes.size() <= 2)
+    return false;
+  // Turn every ProposedToGather entry into a NeedToGather entry.
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::NeedToGather)
+      continue;
+    if (Entry->State == TreeEntry::ProposedToGather) {
+      Entry->State = TreeEntry::NeedToGather;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+      }
+    }
+  }
+  // Canceled operations may still be users of scalars that stay vectorized.
+  // Record every such use in ExternalUses, the list of values that need to
+  // be extracted out of the tree.
+  for (TreeEntry *Entry : VecNodes)
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        if (!Tree->VecToScalars.count(U))
+          continue;
+        // Ignore users in the user ignore list.
+        auto *UserInst = cast<Instruction>(U);
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        Tree->ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  return true;
+}
+
+bool BoUpSLP::tryPartialVectorization() {
+  if (BuiltTrees.size() < 2) // Fewer than two tree states were built.
+    return false;
+  Tree = BuiltTrees.front().get(); // Switch to the first tree in BuiltTrees.
+  vectorizeTree(); // NOTE(review): Tree is not switched back afterwards.
+  LLVM_DEBUG(dbgs() << "SLP: Decided to partially vectorize tree with cost: "
+                    << Tree->TotalCost << ".\n");
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3223,7 +3442,7 @@
 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
   return I->hasOneUse() ||
          std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
-           return ScalarToTreeEntry.count(U) > 0;
+           return Tree->ScalarToTreeEntry.count(U) > 0;
          });
 }
 
@@ -3239,9 +3458,10 @@
 
   // If we have computed a smaller type for the expression, update VecTy so
   // that the costs will be accurate.
-  if (MinBWs.count(VL[0]))
+  if (Tree->MinBWs.count(VL[0]))
     VecTy = VectorType::get(
-        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+        IntegerType::get(F->getContext(), Tree->MinBWs[VL[0]].first),
+        VL.size());
 
   unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
@@ -3268,7 +3488,7 @@
           // instruction as dead and remove its cost from the final cost of the
           // vectorized tree.
           if (areAllUsersVectorized(cast<Instruction>(V)) &&
-              !ScalarToTreeEntry.count(V)) {
+              !Tree->ScalarToTreeEntry.count(V)) {
             auto *IO = cast<ConstantInt>(
                 cast<ExtractElementInst>(V)->getIndexOperand());
             Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -3378,7 +3598,7 @@
       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
       int VecCost = 0;
       // Check if the values are candidates to demote.
-      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+      if (!Tree->MinBWs.count(VL0) || VecTy != SrcVecTy) {
         VecCost = ReuseShuffleCost +
                   TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
       }
@@ -3599,25 +3819,26 @@
 
 bool BoUpSLP::isFullyVectorizableTinyTree() const {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
-                    << VectorizableTree.size() << " is fully vectorizable .\n");
+                    << Tree->VectorizableTree.size()
+                    << " is fully vectorizable .\n");
 
   // We only handle trees of heights 1 and 2.
-  if (VectorizableTree.size() == 1 &&
-      VectorizableTree[0]->State == TreeEntry::Vectorize)
+  if (Tree->VectorizableTree.size() == 1 &&
+      Tree->VectorizableTree[0]->State == TreeEntry::Vectorize)
     return true;
 
-  if (VectorizableTree.size() != 2)
+  if (Tree->VectorizableTree.size() != 2)
     return false;
 
   // Handle splat and all-constants stores.
-  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
-      (allConstant(VectorizableTree[1]->Scalars) ||
-       isSplat(VectorizableTree[1]->Scalars)))
+  if (Tree->VectorizableTree[0]->State == TreeEntry::Vectorize &&
+      (allConstant(Tree->VectorizableTree[1]->Scalars) ||
+       isSplat(Tree->VectorizableTree[1]->Scalars)))
     return true;
 
   // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
-      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+  if (Tree->VectorizableTree[0]->State == TreeEntry::NeedToGather ||
+      Tree->VectorizableTree[1]->State == TreeEntry::NeedToGather)
     return false;
 
   return true;
@@ -3627,8 +3848,8 @@
   if (RdxOpcode != Instruction::Or)
     return false;
 
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+  unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size();
+  Value *FirstReduced = Tree->VectorizableTree[0]->Scalars[0];
 
   // Look past the reduction to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
@@ -3663,7 +3884,7 @@
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
-  if (VectorizableTree.size() >= MinTreeSize)
+  if (Tree->VectorizableTree.size() >= MinTreeSize)
     return false;
 
   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
@@ -3671,8 +3892,8 @@
   if (isFullyVectorizableTinyTree())
     return false;
 
-  assert(VectorizableTree.empty()
-             ? ExternalUses.empty()
+  assert(Tree->VectorizableTree.empty()
+             ? Tree->ExternalUses.empty()
              : true && "We shouldn't have any external users");
 
   // Otherwise, we can't vectorize the tree. It is both tiny and not fully
@@ -3680,18 +3901,30 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() const {
+bool BoUpSLP::isGoodSubTreeToVectorize() const {
+  // The sub-tree qualifies as soon as one entry still marked Vectorize has a
+  // main operation that is a binary, floating-point math or compare op.
+  return llvm::any_of(
+      Tree->VectorizableTree, [](const std::unique_ptr<TreeEntry> &TEPtr) {
+        if (TEPtr->State != TreeEntry::Vectorize)
+          return false;
+        Instruction *MainOp = TEPtr->getMainOp();
+        return MainOp && (isa<BinaryOperator>(MainOp) ||
+                          isa<FPMathOperator>(MainOp) || isa<CmpInst>(MainOp));
+      });
+}
+
+int BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   int Cost = 0;
 
   SmallPtrSet<Instruction*, 4> LiveValues;
   Instruction *PrevInst = nullptr;
 
-  for (const auto &TEPtr : VectorizableTree) {
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
     if (!Inst)
       continue;
@@ -3704,7 +3937,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+      if (isa<Instruction>(&*J) && Tree->ScalarsToVec.count(&*J))
         LiveValues.insert(cast<Instruction>(&*J));
     }
 
@@ -3732,14 +3965,14 @@
            !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
           &*PrevInstIt != PrevInst)
         NumCalls++;
-
       ++PrevInstIt;
     }
 
     if (NumCalls) {
+      Tree->NoCallInst = false;
       SmallVector<Type*, 4> V;
       for (auto *II : LiveValues)
-        V.push_back(VectorType::get(II->getType(), BundleWidth));
+        V.push_back(VectorType::get(II->getType(), Tree->BundleWidth));
       Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
     }
 
@@ -3749,15 +3982,253 @@
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const {
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+  // If we plan to rewrite the tree in a smaller type, we will need to sign-
+  // or zero-extend the extracted value back to the original type. Here, we
+  // account for the extract and the added cost of the extension if needed.
+  auto *VecTy = VectorType::get(EU.Scalar->getType(), Tree->BundleWidth);
+  Value *ScalarRoot = Tree->VectorizableTree.front()->Scalars[0];
+
+  auto It = Tree->MinBWs.find(ScalarRoot);
+  if (It != Tree->MinBWs.end()) {
+    uint64_t Width = It->second.first;
+    bool Signed = It->second.second;
+    auto *MinTy = IntegerType::get(F->getContext(), Width);
+    unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt;
+    VecTy = VectorType::get(MinTy, Tree->BundleWidth);
+    return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy,
+                                          EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
 
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+int BoUpSLP::getExtractCost() {
+  int ExtractCost = 0;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
+  // First, for every scalar of an entry that is proposed to be gathered, add
+  // the extract cost of each use recorded for it in InternalTreeUses.
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::ProposedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      // A scalar without a record in InternalTreeUses adds nothing; unlike
+      // the external uses below, these costs are not deduplicated.
+      auto It = Tree->InternalTreeUses.find(V);
+      if (It != Tree->InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+    }
+  }
+  for (const ExternalUser &EU : Tree->ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    int Cost = getExtractOperationCost(EU);
+    ExtractCost += Cost;
+    if (!Tree->IsCostSumReady) { // Per-entry costs not recorded yet.
+      TreeEntry *TE = getTreeEntry(EU.Scalar);
+      assert(TE && "Incorrect tree state");
+      TE->ExtractCost += Cost;
+    }
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost() const {
+  // Only entries in the ProposedToGather state contribute: each of their
+  // scalars is charged one gather cost per operand that is still present in
+  // ScalarsToVec.
+  int TotalCost = 0;
+  for (const std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    const TreeEntry &TE = *TEPtr;
+    if (TE.State == TreeEntry::ProposedToGather) {
+      for (Value *Scalar : TE.Scalars) {
+        // Every scalar of such an entry is expected to be an instruction.
+        auto *ScalarInst = cast<Instruction>(Scalar);
+        for (Value *Operand : ScalarInst->operands())
+          if (Tree->ScalarsToVec.count(Operand))
+            TotalCost += getGatherCost(Scalar);
+      }
+    }
+  }
+  return TotalCost;
+}
+
+bool BoUpSLP::cutPath(int &Cost, SetVector<TreeEntry *> &Path) {
+  // Cancel the nodes of Path one by one until it is exhausted, the budget is
+  // spent, or the partial cost beats the threshold and cutTree() succeeds.
+  for (TreeEntry *N : Path) {
+    Tree->CostsRecalculations++;
+
+    // Stop once the budget of cost recalculations is used up.
+    if (Tree->CostsRecalculations >= MaxCostsRecalculations)
+      break;
+
+    // We no longer propose to vectorize this node, so we subtract the cost
+    // of this node from the cost of all vectorizable nodes.
+    assert(N->State == TreeEntry::Vectorize &&
+           "Incorrect node state, visiting twice.");
+    N->State = TreeEntry::ProposedToGather;
+    Cost -= N->Cost;
+    for (Value *V : N->Scalars) {
+      Tree->ScalarsToVec.erase(V);
+      Tree->VecToScalars.insert(V);
+      if (Tree->VecInserts.find(V) != Tree->VecInserts.end()) {
+        Cost -= Tree->VecInserts[V];
+        Tree->VecInserts.erase(V);
+      }
+    }
+    for (Value *V : N->Scalars) {
+      // Add the extract cost of every use recorded for this canceled scalar
+      // in InternalTreeUses.
+      auto It = Tree->InternalTreeUses.find(V);
+      if (It != Tree->InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          Cost += getExtractOperationCost(IU);
+      }
+      auto *Inst = cast<Instruction>(V);
+      for (Use &U : Inst->operands()) {
+        Value *Op = U.get();
+        if (Tree->ScalarsToVec.count(Op)) {
+          int InsertCost = getGatherCost(V);
+          Tree->VecInserts[Op] = Tree->VecInserts[Op] + InsertCost;
+          Cost += InsertCost;
+        }
+      }
+    }
+    Cost -= N->ExtractCost;
+    int PartialCost = Cost;
+    if (!Tree->NoCallInst)
+      PartialCost += getSpillCost();
+    Tree->RemovedOperations.push_back(N);
+    for (Value *V : N->Scalars) {
+      Tree->ScalarToTreeEntry.erase(V);
+      Tree->MustGather.insert(V);
+      Tree->ExternalUses.erase(
+          llvm::remove_if(Tree->ExternalUses,
+                          [&V](ExternalUser &EU) { return EU.Scalar == V; }),
+          Tree->ExternalUses.end());
+    }
+#ifndef NDEBUG // NOTE(review): these checks call helpers that mutate Tree.
+    if (Tree->NoCallInst)
+      assert(getSpillCost() == 0 && "Incorrect spill cost");
+    assert(PartialCost == getTreeCost() && "Incorrect partial cost");
+#endif
+    if (PartialCost < -SLPCostThreshold && cutTree()) {
+      LLVM_DEBUG(
+          dbgs() << "SLP: Possible to partially vectorize tree with cost = "
+                 << PartialCost << "\n");
+      return true;
+    }
+  }
+  return false;
+}
+
+void BoUpSLP::findLeaf(TreeEntry *C, SetVector<TreeEntry *> &Path) const {
+  // Walk from C to a leaf: keep stepping to the last not yet visited entry
+  // of UseEntries that is still marked Vectorize, recording each step.
+  Path.insert(C); // A SetVector ignores elements it already holds.
+  while (true) {
+    auto It = llvm::find_if(
+        llvm::reverse(C->UseEntries), [C, &Path](TreeEntry *Next) {
+          // Skip processed nodes and self references to avoid cycles.
+          return Next->State == TreeEntry::Vectorize && !Path.count(Next) &&
+                 Next != C;
+        });
+    if (It == C->UseEntries.rend())
+      return;
+    C = *It;
+    Path.insert(C);
+  }
+}
+
+bool BoUpSLP::findSubTree() {
+  SetVector<TreeEntry *> Path;
+  SetVector<TreeEntry *> SubPath;
+  TreeEntry *Node = Tree->VectorizableTree.front().get();
+  int Cost = Tree->TotalCost;
+
+  // To start, find one leaf node that is not the root node of the graph,
+  // i.e. one with a non-zero index. Path is then the route from the root
+  // node to that leaf node.
+  findLeaf(Node, Path);
+  if (Node == Path.back())
+    return false;
+  do {
+    Node = Path.back();
+    assert(Node->State == TreeEntry::Vectorize && "Incorrect node state");
+    // If we found a branch node, i.e. a node with more than one non-gathering
+    // child, try to find a set of profitable nodes in SubPath to vectorize.
+    // If there is no such set, consider another leaf that is reachable from
+    // this branch node.
+    if (Node->isBranch()) {
+      if (cutPath(Cost, SubPath))
+        return true;
+      if (Tree->CostsRecalculations >= MaxCostsRecalculations) {
+        SubPath.clear();
+        break;
+      }
+      TreeEntry *NextFromBranch = nullptr;
+      auto It = llvm::find_if(
+          llvm::reverse(Node->UseEntries), [&Node, &Path](TreeEntry *E) {
+            return (E != Node && E->State == TreeEntry::Vectorize &&
+                    !Path.count(E));
+          });
+      if (It != Node->UseEntries.rend())
+        NextFromBranch = *It;
+      SubPath.clear();
+      if (NextFromBranch && NextFromBranch != Node) {
+        findLeaf(NextFromBranch, Path);
+        Node = Path.back();
+      }
+    } else {
+      // If this node is not a branch node, queue it in SubPath and step back
+      // along Path until we reach the root node of the graph or encounter
+      // another branch node.
+      SubPath.insert(Node);
+      Path.pop_back();
+    }
+  } while (Node->Idx);
+
+  // No branches are left, so try to reduce the single remaining path.
+  if (!SubPath.empty()) {
+    if (cutPath(Cost, SubPath))
+      return true;
+  }
+
+#ifndef NDEBUG
+  // Make sure that we have processed all nodes.
+  if (Tree->CostsRecalculations < MaxCostsRecalculations)
+    for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+      TreeEntry *Entry = TEPtr.get();
+      if (Entry->State == TreeEntry::NeedToGather)
+        continue;
+      assert(Entry->State == TreeEntry::ProposedToGather &&
+             "Incorrect node state");
+    }
+#endif
+  return false;
+}
+
+int BoUpSLP::getRawTreeCost() {
+  int CostSum = 0;
+  Tree->BundleWidth = Tree->VectorizableTree.front()->Scalars.size();
+  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+                    << Tree->VectorizableTree.size() << ".\n");
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();
 
     // We create duplicate tree entries for gather sequences that have multiple
     // uses. However, we should not compute the cost of duplicate sequences.
@@ -3771,68 +4242,74 @@
     // their uses. Since such an approach results in fewer total entries,
     // existing heuristics based on tree size may yield different results.
     //
-    if (TE.State == TreeEntry::NeedToGather &&
-        std::any_of(std::next(VectorizableTree.begin(), I + 1),
-                    VectorizableTree.end(),
-                    [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
-                      return EntryPtr->State == TreeEntry::NeedToGather &&
-                             EntryPtr->isSame(TE.Scalars);
-                    }))
+    if (TE.State == TreeEntry::ProposedToGather)
+      Tree->VecToScalars.insert(TE.Scalars.begin(), TE.Scalars.end());
+    if (TE.State != TreeEntry::Vectorize &&
+        llvm::any_of(llvm::drop_begin(Tree->VectorizableTree, TE.Idx + 1),
+                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                       return EntryPtr->State != TreeEntry::Vectorize &&
+                              EntryPtr->isSame(TE.Scalars);
+                     }))
       continue;
 
-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    if (TE.State == TreeEntry::Vectorize)
+      Tree->ScalarsToVec.insert(TE.Scalars.begin(), TE.Scalars.end());
+
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
+    CostSum += TE.Cost;
   }
 
-  SmallPtrSet<Value *, 16> ExtractCostCalculated;
-  int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  if (SLPThrottling)
+    for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State != TreeEntry::Vectorize)
+        continue;
+      int GatherCost = 0;
+      for (TreeEntry *Gather : TE->UseEntries)
+        if (Gather->State != TreeEntry::Vectorize)
+          GatherCost += Gather->Cost;
+      TE->Cost += GatherCost;
+    }
+  return CostSum;
+}
 
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+int BoUpSLP::getTreeCost() {
+  int CostSum;
+  if (!Tree->IsCostSumReady) {
+    CostSum = getRawTreeCost();
+    Tree->RawTreeCost = CostSum;
+  } else {
+    CostSum = Tree->RawTreeCost;
+  }
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+  if (SLPThrottling)
+    for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State == TreeEntry::ProposedToGather)
+        CostSum -= TE->Cost;
     }
-  }
 
+  int ExtractCost = getExtractCost();
+  Tree->IsCostSumReady = true;
+  int InsertCost = getInsertCost();
   int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  int Cost = CostSum + ExtractCost + SpillCost + InsertCost;
+  Tree->TotalCost = CostSum + ExtractCost;
 
-  std::string Str;
-  {
-    raw_string_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
-  }
+#ifndef NDEBUG
+  SmallString<256> Str;
+  raw_svector_ostream OS(Str);
+  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+     << "SLP: Extract Cost = " << ExtractCost << ".\n"
+     << "SLP: Insert Cost = " << InsertCost << ".\n"
+     << "SLP: Total Cost = " << Cost << ".\n";
   LLVM_DEBUG(dbgs() << Str);
-
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
-
+#endif
   return Cost;
 }
 
@@ -3902,9 +4379,9 @@
   // scheduled, and the last instruction is VL.back(). So we start with
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
-  if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
-        BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+  if (Tree->BlocksSchedules.count(BB)) {
+    auto *Bundle = Tree->BlocksSchedules[BB]->getScheduleData(
+        E->isOneOf(E->Scalars.back()));
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
         if (Bundle->OpValue == Bundle->Inst)
@@ -3972,7 +4449,7 @@
               std::distance(E->ReuseShuffleIndices.begin(),
                             llvm::find(E->ReuseShuffleIndices, FoundLane));
         }
-        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+        Tree->ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
       }
     }
   }
@@ -4333,7 +4810,7 @@
       // future.
       Value *PO = LI->getPointerOperand();
       if (getTreeEntry(PO))
-        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+        Tree->ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
 
       MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
       LI = Builder.CreateLoad(VecTy, VecPtr);
@@ -4382,7 +4859,8 @@
       // ExternalUses to make sure that an extract will be generated in the
       // future.
       if (getTreeEntry(ScalarPtr))
-        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+        Tree->ExternalUses.push_back(
+            ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
 
       if (!Alignment)
         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
@@ -4476,7 +4954,7 @@
       // call to ExternalUses list to make sure that an extract will be
       // generated in the future.
       if (ScalarArg && getTreeEntry(ScalarArg))
-        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+        Tree->ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
 
       propagateIRFlags(V, E->Scalars, VL0);
       if (NeedToShuffleReuses) {
@@ -4571,42 +5049,48 @@
 Value *
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
-  for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+  for (auto &BSIter : Tree->BlocksSchedules) {
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove the schedule data of all nodes whose vectorization decision
+    // we have changed.
+    if (!Tree->RemovedOperations.empty())
+      removeFromScheduling(BS);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+  auto *VectorRoot = vectorizeTree(Tree->VectorizableTree[0].get());
 
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
-  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-  if (MinBWs.count(ScalarRoot)) {
+  auto *ScalarRoot = Tree->VectorizableTree[0]->Scalars[0];
+  if (Tree->MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast<Instruction>(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
-    auto BundleWidth = VectorizableTree[0]->Scalars.size();
-    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-    auto *VecTy = VectorType::get(MinTy, BundleWidth);
+    Tree->BundleWidth = Tree->VectorizableTree[0]->Scalars.size();
+    auto *MinTy =
+        IntegerType::get(F->getContext(), Tree->MinBWs[ScalarRoot].first);
+    auto *VecTy = VectorType::get(MinTy, Tree->BundleWidth);
     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
-    VectorizableTree[0]->VectorizedValue = Trunc;
+    Tree->VectorizableTree[0]->VectorizedValue = Trunc;
   }
 
-  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+  LLVM_DEBUG(dbgs() << "SLP: Extracting " << Tree->ExternalUses.size()
                     << " values .\n");
 
   // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
   // specified by ScalarType.
   auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
-    if (!MinBWs.count(ScalarRoot))
+    if (!Tree->MinBWs.count(ScalarRoot))
       return Ex;
-    if (MinBWs[ScalarRoot].second)
+    if (Tree->MinBWs[ScalarRoot].second)
       return Builder.CreateSExt(Ex, ScalarType);
     return Builder.CreateZExt(Ex, ScalarType);
   };
 
   // Extract all of the elements with the external uses.
-  for (const auto &ExternalUse : ExternalUses) {
+  for (const auto &ExternalUse : Tree->ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
     llvm::User *User = ExternalUse.User;
 
@@ -4685,7 +5169,7 @@
   }
 
   // For each vectorized value:
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : Tree->VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -4700,7 +5184,9 @@
 
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
-      if (!Ty->isVoidTy()) {
+      // The tree might not be fully vectorized, so we don't have to
+      // check every user.
+      if (!Ty->isVoidTy() && Tree->RemovedOperations.empty()) {
         for (User *U : Scalar->users()) {
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
 
@@ -4717,7 +5203,14 @@
 
   Builder.ClearInsertionPoint();
 
-  return VectorizableTree[0]->VectorizedValue;
+  // Erase all saved trees after vectorization, except current.
+  BuiltTrees.erase(llvm::remove_if(BuiltTrees,
+                                   [&](std::unique_ptr<TreeState> &T) {
+                                     return T.get() != Tree;
+                                   }),
+                   BuiltTrees.end());
+
+  return Tree->VectorizableTree[0]->VectorizedValue;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -5185,6 +5678,31 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS) {
+  // Cancel scheduling for each removed entry whose first scalar is part of a
+  // bundle in this block, resetting the schedule before the first one.
+  bool AnyCanceled = false;
+  for (TreeEntry *Removed : Tree->RemovedOperations) {
+    ScheduleData *Bundle = BS->getScheduleData(Removed->Scalars[0]);
+    if (!Bundle || !Bundle->isPartOfBundle())
+      continue;
+    if (!AnyCanceled)
+      BS->resetSchedule();
+    AnyCanceled = true;
+    BS->cancelScheduling(Removed->Scalars, Bundle->OpValue);
+  }
+  if (!AnyCanceled)
+    return;
+  // Reset once more, refill the ready list and clear the dependencies of
+  // every instruction in the scheduling region that has schedule data.
+  BS->resetSchedule();
+  BS->initialFillReadyList(BS->ReadyInsts);
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode())
+    if (BS->ScheduleDataMap.count(I))
+      BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -5385,11 +5903,11 @@
 void BoUpSLP::computeMinimumValueSizes() {
   // If there are no external uses, the expression tree must be rooted by a
   // store. We can't demote in-memory values, so there is nothing to do here.
-  if (ExternalUses.empty())
+  if (Tree->ExternalUses.empty())
     return;
 
   // We only attempt to truncate integer expressions.
-  auto &TreeRoot = VectorizableTree[0]->Scalars;
+  auto &TreeRoot = Tree->VectorizableTree[0]->Scalars;
   auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
   if (!TreeRootIT)
     return;
@@ -5401,7 +5919,7 @@
   // must have multiple uses and InstCombine will not rewrite it. The code
   // below ensures that only the roots are used externally.
   SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
-  for (auto &EU : ExternalUses)
+  for (auto &EU : Tree->ExternalUses)
     if (!Expr.erase(EU.Scalar))
       return;
   if (!Expr.empty())
@@ -5410,7 +5928,7 @@
   // Collect the scalar values of the vectorizable expression. We will use this
   // context to determine which values can be demoted. If we see a truncation,
   // we mark it as seeding another demotion.
-  for (auto &EntryPtr : VectorizableTree)
+  for (auto &EntryPtr : Tree->VectorizableTree)
     Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
 
   // Ensure the roots of the vectorizable tree don't form a cycle. They must
@@ -5513,7 +6031,7 @@
 
   // Finally, map the values we can demote to the maximum bit with we computed.
   for (auto *Scalar : ToDemote)
-    MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+    Tree->MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
 }
 
 namespace {
@@ -5653,6 +6171,12 @@
                         << " underlying objects.\n");
       Changed |= vectorizeGEPIndices(BB, R);
     }
+
+    // Partially vectorize trees after all full vectorization is done,
+    // otherwise, we could prevent more profitable full vectorization with
+    // smaller vector sizes.
+    if (SLPThrottling)
+      Changed |= R.tryPartialVectorization();
   }
 
   if (Changed) {
@@ -5705,9 +6229,11 @@
                      << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                      << " and with tree size "
                      << NV("TreeSize", R.getTreeSize()));
-
     R.vectorizeTree();
     return true;
+  } else {
+    if (SLPThrottling && R.findSubTree())
+      R.saveTree();
   }
 
   return false;
@@ -5958,6 +6484,9 @@
         I += VF - 1;
         NextInst = I + 1;
         Changed = true;
+      } else {
+        if (SLPThrottling && R.findSubTree())
+          R.saveTree();
       }
     }
   }
Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -81,20 +81,21 @@
 ; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
 ; SSE-NEXT:    [[SHR_2:%.*]] = lshr i64 undef, 6
 ; SSE-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]]
-; SSE-NEXT:    [[AND_4:%.*]] = shl i64 [[ADD]], 2
-; SSE-NEXT:    [[SHL_4:%.*]] = and i64 [[AND_4]], 20
 ; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
 ; SSE-NEXT:    store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2
-; SSE-NEXT:    [[SHL_5:%.*]] = and i64 [[AND_5]], 20
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[ADD_1]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[ADD]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = shl <2 x i64> [[TMP2]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], <i64 20, i64 20>
 ; SSE-NEXT:    [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6
-; SSE-NEXT:    [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]]
-; SSE-NEXT:    store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1
 ; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; SSE-NEXT:    store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1
 ; SSE-NEXT:    [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6
-; SSE-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]]
-; SSE-NEXT:    store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[SHR_6]], i32 0
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[SHR_5]], i32 1
+; SSE-NEXT:    [[TMP7:%.*]] = add nuw nsw <2 x i64> [[TMP4]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @pr35497(
Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,20 @@
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP3]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
 ; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    unreachable
 ;
 entry: