Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -93,6 +93,7 @@
 #include <cstdint>
 #include <iterator>
 #include <memory>
+#include <queue>
 #include <set>
 #include <string>
 #include <tuple>
@@ -125,6 +126,11 @@
     cl::desc(
         "Attempt to vectorize horizontal reductions feeding into a store"));
 
+static cl::opt<unsigned>
+    SLPThrottleBudget("slp-throttling-budget", cl::init(32), cl::Hidden,
+                      cl::desc("Limit the total number of nodes for cost "
+                               "recalculations during throttling"));
+
 static cl::opt<int>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
@@ -634,11 +640,61 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();
+
+  /// \returns the cost extracting vectorized elements.
+  InstructionCost getExtractShuffleCost(InstructionCost Cost);
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  InstructionCost getInsertCost(ArrayRef<Value *> VectorizedVals);
+
+  /// Find a non-gathering leaf node from current node C and record the path
+  /// on the way.
+  void findLeaf(TreeEntry *C, SetVector<TreeEntry *> &Path) const;
+
+  using SubTreeQueue =
+    std::priority_queue<std::pair<int, std::vector<TreeEntry *>>>;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. SLP algorithm looks to operations to vectorize starting
+  /// from seed instructions on the bottom toward the end of chains of
+  /// dependencies to the top of SLP graph, it groups potentially vectorizable
+  /// operations in scalar form to bundles.
+  /// For example:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> vector form
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// Total cost is not profitable to vectorize, hence all operations are in
+  /// scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///   <bundle 1> vector form
+  ///      |
+  ///   <bundle 2> vector form  <bundle 3> gathered nodes
+  ///       \                    /
+  ///        <seed root bundle> vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part on the tree, while all tree vectorization
+  /// does not make sense.
+  /// More details:
+  /// https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf
+  bool findSubTree(SubTreeQueue &SubTrees, InstructionCost TreeCost);
+
+  /// Get raw summary of all elements of the tree.
+  InstructionCost getRawTreeCost(ArrayRef<Value *> VectorizedVals = None);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
+  InstructionCost getTreeCost(bool TreeReduce,
+                              ArrayRef<Value *> VectorizedVals = None);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -659,6 +715,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    ProposedToGather.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -667,6 +725,10 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    NoCallInst = true;
+    RawTreeCost = 0;
+    ReduceableCost = 0;
+    IsCostSumReady = false;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -829,6 +891,9 @@
   ///       may not be necessary.
   bool isLoadCombineCandidate() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1647,6 +1712,9 @@
     /// Does this entry require reordering?
     SmallVector<unsigned, 4> ReorderIndices;
 
+    /// Cost of this tree entry.
+    InstructionCost Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -1659,6 +1727,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// Use of this entry.
+    TinyPtrVector<TreeEntry *> UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -1780,6 +1851,13 @@
       return FoundLane;
     }
 
+    // Find nodes with more than one use.
+    bool isFork() const {
+      return llvm::count_if(UseEntries, [this](TreeEntry *Next) {
+               return (Next->Idx != Idx && Next->State != TreeEntry::NeedToGather);
+             }) > 1;
+    }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -1804,6 +1882,8 @@
         dbgs() << "NeedToGather\n";
         break;
       }
+      dbgs() << "Cost: ";
+      dbgs() << Cost << "\n";
       dbgs() << "MainOp: ";
       if (MainOp)
         dbgs() << *MainOp << "\n";
@@ -1902,8 +1982,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last);
+    }
 
     return Last;
   }
@@ -1953,6 +2035,13 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  /// \returns the cost of extracting the vectorized elements.
+  InstructionCost
+  getExtractOperationCost(const ExternalUser &EU,
+                          SmallVectorImpl<SmallVector<int>> &ShuffleMask,
+                          SmallVectorImpl<Value *> &FirstUsers,
+                          SmallVectorImpl<APInt> &DemandedElts);
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -2000,6 +2089,27 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallPtrSet<TreeEntry *, 2> ProposedToGather;
+
+  /// Raw cost of all elemts in the tree.
+  InstructionCost RawTreeCost = 0;
+
+  InstructionCost ReduceableCost = 0;
+
+  /// Indicate that no CallInst found in the tree and we don't need to
+  /// calculate spill cost.
+  bool NoCallInst = true;
+
+  /// True, if we have calucalted tree cost for the tree.
+  bool IsCostSumReady = false;
+
+  /// Current operations width to vectorize.
+  unsigned BundleWidth = 0;
+
+  /// Internal tree oprations proposed to be vectorized values use.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -2342,6 +2452,9 @@
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
 
+    /// Make the scheduling region smaller.
+    void reduceSchedulingRegion(Instruction *Start, Instruction *End);
+
     BasicBlock *BB;
 
     /// Simple memory allocation for ScheduleData.
@@ -2404,6 +2517,9 @@
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
 
+  /// Remove operations from the list of proposed to schedule.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
@@ -2537,6 +2653,7 @@
   std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
     std::string Str;
     raw_string_ostream OS(Str);
+    OS << "Idx: " << Entry->Idx << "\n";
     if (isSplat(Entry->Scalars)) {
       OS << "<splat> " << *Entry->Scalars[0];
       return Str;
@@ -2607,7 +2724,7 @@
   buildTree_rec(Roots, 0, EdgeInfo());
 
   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
@@ -2642,6 +2759,7 @@
           // Some in-tree scalars will remain as scalar in vectorized
           // instructions. If that is the case, the one in Lane 0 will
           // be used.
+          InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           if (UseScalar != U ||
               UseEntry->State == TreeEntry::ScatterVectorize ||
               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
@@ -3429,6 +3547,49 @@
   }
 }
 
+void BoUpSLP::cutTree() {
+  SmallVector<TreeEntry *, 4> VecNodes;
+
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::NeedToGather)
+      continue;
+    // For all canceled operations we should consider the possibility of
+    // use by with non-canceled operations and for that, it requires
+    // to populate ExternalUser list with canceled elements.
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        TreeEntry *UserTE = getTreeEntry(U);
+        if (!UserTE || ProposedToGather.count(UserTE) == 0)
+          continue;
+        // Ignore users in the user ignore list.
+        auto *UserInst = dyn_cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  }
+  // Canceling unprofitable elements.
+  for (TreeEntry *Entry : ProposedToGather) {
+    for (Value *V : Entry->Scalars) {
+      ScalarToTreeEntry.erase(V);
+#ifndef NDEBUG
+      LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                        << " out of proposed to vectorize.\n");
+#endif
+    }
+  }
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3747,7 +3908,7 @@
     SmallVector<const TreeEntry *> Entries;
     Optional<TargetTransformInfo::ShuffleKind> Shuffle =
         isGatherShuffledEntry(E, Mask, Entries);
-    if (Shuffle.hasValue()) {
+    if (Shuffle.hasValue() && ProposedToGather.count(E) == 0) {
       InstructionCost GatherCost = 0;
       if (ShuffleVectorInst::isIdentityMask(Mask)) {
         // Perfect match in the graph, will reuse the previously vectorized
@@ -4356,12 +4517,215 @@
   return true;
 }
 
-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost
+BoUpSLP::getExtractOperationCost(const ExternalUser &EU,
+                                 SmallVectorImpl<SmallVector<int>> &ShuffleMask,
+                                 SmallVectorImpl<Value *> &FirstUsers,
+                                 SmallVectorImpl<APInt> &DemandedElts) {
+  InstructionCost ExtractCost = 0;
+  SmallVector<unsigned> VF;
+
+  // If found user is an insertelement, do not calculate extract cost but try
+  // to detect it as a final shuffled/identity match.
+  if (EU.User && isa<InsertElementInst>(EU.User)) {
+    if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
+      Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+      if (!InsertIdx || *InsertIdx == UndefMaskElem)
+        return 0;
+      Value *VU = EU.User;
+      auto *It = find_if(FirstUsers, [VU](Value *V) {
+        // Checks if 2 insertelements are from the same buildvector.
+        if (VU->getType() != V->getType())
+          return false;
+        auto *IE1 = cast<InsertElementInst>(VU);
+        auto *IE2 = cast<InsertElementInst>(V);
+        // Go though of insertelement instructions trying to find either VU as
+        // the original vector for IE2 or V as the original vector for IE1.
+        do {
+          if (IE1 == VU || IE2 == V)
+            return true;
+          if (IE1)
+            IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+          if (IE2)
+            IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+        } while (IE1 || IE2);
+        return false;
+      });
+      int VecId = -1;
+      if (It == FirstUsers.end()) {
+        VF.push_back(FTy->getNumElements());
+        ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+        FirstUsers.push_back(EU.User);
+        DemandedElts.push_back(APInt::getZero(VF.back()));
+        VecId = FirstUsers.size() - 1;
+      } else {
+        VecId = std::distance(FirstUsers.begin(), It);
+      }
+      int Idx = *InsertIdx;
+      ShuffleMask[VecId][Idx] = EU.Lane;
+      DemandedElts[VecId].setBit(Idx);
+    }
+  }
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
+  auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+  if (MinBWs.count(ScalarRoot)) {
+    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+    auto Extend =
+        MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+    VecTy = FixedVectorType::get(MinTy, BundleWidth);
+    ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                                 VecTy, EU.Lane);
+  } else {
+    ExtractCost +=
+        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+  }
+  return ExtractCost;
+}
+
+InstructionCost BoUpSLP::getExtractShuffleCost(InstructionCost Cost) {
+  InstructionCost ExtractCost = 0;
+  InstructionCost ShuffleCost = 0;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
+  SmallVector<SmallVector<int>> ShuffleMask;
+  SmallVector<Value *> FirstUsers;
+  SmallVector<APInt> DemandedElts;
+  for (const ExternalUser &EU : ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    // Uses by ephemeral values are free (because the ephemeral value will be
+    // removed prior to code generation, and so the extraction will be
+    // removed as well).
+    if (EphValues.count(EU.User))
+      continue;
+
+    // No extract cost for vector "scalar"
+    if (isa<FixedVectorType>(EU.Scalar->getType()))
+      continue;
+
+    // Already counted the cost for external uses when tried to adjust the cost
+    // for extractelements, no need to add it again.
+    if (isa<ExtractElementInst>(EU.Scalar))
+      continue;
+
+    ExtractCost +=
+        getExtractOperationCost(EU, ShuffleMask, FirstUsers, DemandedElts);
+  }
+
+  // Consider the possibility of extracting vectorized
+  // values for canceled elements use.
+  for (TreeEntry *Entry : ProposedToGather) {
+    for (Value *V : Entry->Scalars) {
+      // Consider the possibility of extracting vectorized
+      // values for canceled elements use.
+      auto It = InternalTreeUses.find(V);
+      if (It != InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          if (!ExtractCostCalculated.insert(IU.Scalar).second)
+            ExtractCost += getExtractOperationCost(IU, ShuffleMask, FirstUsers,
+                                                   DemandedElts);
+      }
+    }
+  }
+
+  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+    // For the very first element - simple shuffle of the source vector.
+    int Limit = ShuffleMask[I].size() * 2;
+    if (I == 0 &&
+        all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) &&
+        !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
+      InstructionCost C = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc,
+          cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+                        << " for final shuffle of insertelement external users "
+                        << *VectorizableTree.front()->Scalars.front() << ".\n"
+                        << "SLP: Current total cost = " << Cost << "\n");
+      ShuffleCost += C;
+      continue;
+    }
+    // Other elements - permutation of 2 vectors (the initial one and the next
+    // Ith incoming vector).
+    unsigned VF = ShuffleMask[I].size();
+    for (unsigned Idx = 0; Idx < VF; ++Idx) {
+      int &Mask = ShuffleMask[I][Idx];
+      Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
+    }
+    InstructionCost C = TTI->getShuffleCost(
+        TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
+        ShuffleMask[I]);
+    LLVM_DEBUG(
+        dbgs()
+        << "SLP: Adding cost " << C
+        << " for final shuffle of vector node and external insertelement users "
+        << *VectorizableTree.front()->Scalars.front() << ".\n"
+        << "SLP: Current total cost = " << Cost << "\n");
+    ShuffleCost += C;
+    InstructionCost InsertCost = TTI->getScalarizationOverhead(
+        cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+        /*Insert*/ true,
+        /*Extract*/ false);
+    ShuffleCost -= InsertCost;
+    LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+                      << " for insertelements gather.\n"
+                      << "SLP: Current total cost = " << Cost << "\n");
+  }
+
+  return ExtractCost + ShuffleCost;
+}
+
+InstructionCost BoUpSLP::getInsertCost(ArrayRef<Value *> VectorizedVals) {
+  InstructionCost InsertCost = 0;
+  for (TreeEntry *Entry : ProposedToGather) {
+    // Avoid already vectorized TreeEntries, it is already in a vector form and
+    // we don't need to gather those operations or nodes that were once
+    // considered to be vectorized but now don't have any direct relations
+    // to vectorizable nodes.
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      if (llvm::any_of(Inst->users(), [this](User *Op) {
+            if (const TreeEntry *UserTE = getTreeEntry(Op)) {
+              return (ProposedToGather.count(UserTE) != 0);
+            }
+            return false;
+          })) {
+        InsertCost += getEntryCost(Entry, VectorizedVals);
+        break;
+      }
+    }
+  }
+  return InsertCost;
+}
+
+void BoUpSLP::findLeaf(TreeEntry *C, SetVector<TreeEntry *> &Path) const {
+  if (!Path.count(C))
+    Path.insert(C);
+  int NonGatherUse;
+  do {
+    NonGatherUse = 0;
+    for (TreeEntry *Next : llvm::reverse(C->UseEntries)) {
+      // Ignore any processed nodes to avoid cycles.
+      if (Next->State == TreeEntry::NeedToGather || Path.count(Next) ||
+          Next == C)
+        continue;
+      C = Next;
+      Path.insert(C);
+      NonGatherUse++;
+      break;
+    }
+  } while (NonGatherUse != 0);
+}
+
+InstructionCost BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   InstructionCost Cost = 0;
 
   SmallPtrSet<Instruction*, 4> LiveValues;
@@ -4434,6 +4798,7 @@
     }
 
     if (NumCalls) {
+      NoCallInst = false;
       SmallVector<Type*, 4> V;
       for (auto *II : LiveValues) {
         auto *ScalarTy = II->getType();
@@ -4450,155 +4815,105 @@
   return Cost;
 }
 
-InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
-  InstructionCost Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+bool BoUpSLP::findSubTree(SubTreeQueue &SubTrees, InstructionCost TreeCost) {
+  SetVector<TreeEntry *> Path;
+  std::vector<TreeEntry *> SubPath;
+  SetVector<TreeEntry *> Visited;
+  TreeEntry *Node = VectorizableTree.front().get();
 
-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+  // Avoid reducing the tree if there is no potential room to reduce.
+  if ((TreeCost - ReduceableCost) >= -SLPCostThreshold)
+    return false;
 
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+  // To start we can find just one leaf node that happens to be not the root
+  // node of the graph i.e. with non-zero index. Then, Path is route from the
+  // root node to our leaf node.
+  findLeaf(Node, Path);
+  if (Node == Path.back())
+    return false;
+  do {
+    Node = Path.back();
+    Visited.insert(Node);
+    assert(Node->State != TreeEntry::NeedToGather && "Incorrect node state");
+    // If we found a branch node i.e. node with more than one non-gathering
+    // child, we could try to find set of profitable nodes in SubPath to
+    // vectorize and if there is no such set of profitable nodes then we could
+    // consider another leaf that is reachable from this branch node.
+    if (Node->isFork()) {
+      TreeEntry *NextFromFork = nullptr;
+      auto It = llvm::find_if(
+          llvm::reverse(Node->UseEntries), [&Node, &Visited](TreeEntry *E) {
+            return (E != Node && E->State != TreeEntry::NeedToGather &&
+                    !Visited.count(E));
+          });
+      if (It != Node->UseEntries.rend())
+        NextFromFork = *It;
+      else
+        Path.pop_back();
+
+      InstructionCost SubTreeCost = 0;
+      for (TreeEntry *Entry : SubPath)
+        SubTreeCost += Entry->Cost;
+      if (SubTreeCost.isValid() && SubTreeCost > 0)
+        SubTrees.push(std::make_pair(*SubTreeCost.getValue(), SubPath));
+      SubPath.clear();
+      if (NextFromFork && NextFromFork != Node) {
+        findLeaf(NextFromFork, Path);
+        Node = Path.back();
+      }
+    } else {
+      // If this node is not a branch node then we could move to another node
+      // below until we reach the root node of the graph or encounter another
+      // branch node.
+      SubPath.push_back(Node);
+      Path.pop_back();
+    }
+  } while (Node->Idx);
 
-    InstructionCost C = getEntryCost(&TE, VectorizedVals);
-    Cost += C;
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
-                      << " for bundle that starts with " << *TE.Scalars[0]
-                      << ".\n"
-                      << "SLP: Current total cost = " << Cost << "\n");
-  }
+  return (SubTrees.size() > 0);
+}
 
-  SmallPtrSet<Value *, 16> ExtractCostCalculated;
-  InstructionCost ExtractCost = 0;
-  SmallVector<unsigned> VF;
-  SmallVector<SmallVector<int>> ShuffleMask;
-  SmallVector<Value *> FirstUsers;
-  SmallVector<APInt> DemandedElts;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+InstructionCost BoUpSLP::getRawTreeCost(ArrayRef<Value *> VectorizedVals) {
+  InstructionCost CostSum = 0;
+  BundleWidth = VectorizableTree.front()->Scalars.size();
+  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+                    << VectorizableTree.size() << ".\n");
 
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();
 
-    // No extract cost for vector "scalar"
-    if (isa<FixedVectorType>(EU.Scalar->getType()))
-      continue;
+    TE.Cost = getEntryCost(&TE, VectorizedVals);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
+                      << " for bundle that starts with " << *TE.Scalars[0]
+                      << ".\n");
+    CostSum += TE.Cost;
+    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n");
 
-    // Already counted the cost for external uses when tried to adjust the cost
-    // for extractelements, no need to add it again.
-    if (isa<ExtractElementInst>(EU.Scalar))
+    if (TE.State == TreeEntry::NeedToGather)
       continue;
 
-    // If found user is an insertelement, do not calculate extract cost but try
-    // to detect it as a final shuffled/identity match.
-    if (EU.User && isa<InsertElementInst>(EU.User)) {
-      if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
-        Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
-        if (!InsertIdx || *InsertIdx == UndefMaskElem)
-          continue;
-        Value *VU = EU.User;
-        auto *It = find_if(FirstUsers, [VU](Value *V) {
-          // Checks if 2 insertelements are from the same buildvector.
-          if (VU->getType() != V->getType())
-            return false;
-          auto *IE1 = cast<InsertElementInst>(VU);
-          auto *IE2 = cast<InsertElementInst>(V);
-          // Go though of insertelement instructions trying to find either VU as
-          // the original vector for IE2 or V as the original vector for IE1.
-          do {
-            if (IE1 == VU || IE2 == V)
-              return true;
-            if (IE1)
-              IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
-            if (IE2)
-              IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
-          } while (IE1 || IE2);
-          return false;
-        });
-        int VecId = -1;
-        if (It == FirstUsers.end()) {
-          VF.push_back(FTy->getNumElements());
-          ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
-          FirstUsers.push_back(EU.User);
-          DemandedElts.push_back(APInt::getZero(VF.back()));
-          VecId = FirstUsers.size() - 1;
-        } else {
-          VecId = std::distance(FirstUsers.begin(), It);
-        }
-        int Idx = *InsertIdx;
-        ShuffleMask[VecId][Idx] = EU.Lane;
-        DemandedElts[VecId].setBit(Idx);
-      }
-    }
-
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = FixedVectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
-    }
+    if (TE.Idx && TE.Cost > 0)
+      ReduceableCost += TE.Cost;
   }
+  return CostSum;
+}
 
-  InstructionCost SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
-  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
-    // For the very first element - simple shuffle of the source vector.
-    int Limit = ShuffleMask[I].size() * 2;
-    if (I == 0 &&
-        all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) &&
-        !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
-      InstructionCost C = TTI->getShuffleCost(
-          TTI::SK_PermuteSingleSrc,
-          cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
-      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
-                        << " for final shuffle of insertelement external users "
-                        << *VectorizableTree.front()->Scalars.front() << ".\n"
-                        << "SLP: Current total cost = " << Cost << "\n");
-      Cost += C;
-      continue;
-    }
-    // Other elements - permutation of 2 vectors (the initial one and the next
-    // Ith incoming vector).
-    unsigned VF = ShuffleMask[I].size();
-    for (unsigned Idx = 0; Idx < VF; ++Idx) {
-      int &Mask = ShuffleMask[I][Idx];
-      Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
-    }
-    InstructionCost C = TTI->getShuffleCost(
-        TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
-        ShuffleMask[I]);
-    LLVM_DEBUG(
-        dbgs()
-        << "SLP: Adding cost " << C
-        << " for final shuffle of vector node and external insertelement users "
-        << *VectorizableTree.front()->Scalars.front() << ".\n"
-        << "SLP: Current total cost = " << Cost << "\n");
-    Cost += C;
-    InstructionCost InsertCost = TTI->getScalarizationOverhead(
-        cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
-        /*Insert*/ true,
-        /*Extract*/ false);
-    Cost -= InsertCost;
-    LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
-                      << " for insertelements gather.\n"
-                      << "SLP: Current total cost = " << Cost << "\n");
+InstructionCost BoUpSLP::getTreeCost(bool TreeReduce,
+                                     ArrayRef<Value *> VectorizedVals) {
+  InstructionCost CostSum;
+  if (!IsCostSumReady) {
+    CostSum = getRawTreeCost(VectorizedVals);
+    RawTreeCost = CostSum;
+  } else {
+    CostSum = RawTreeCost;
   }
 
+  InstructionCost ExtractCost = getExtractShuffleCost(CostSum);
+  InstructionCost SpillCost = getSpillCost();
+  InstructionCost InsertCost = getInsertCost(VectorizedVals);
+  InstructionCost Cost = CostSum + ExtractCost + SpillCost + InsertCost;
+  InstructionCost FullCost = Cost;
+
 #ifndef NDEBUG
   SmallString<256> Str;
   {
@@ -4612,7 +4927,77 @@
     ViewGraph(this, "SLP" + F->getName(), false, Str);
 #endif
 
-  return Cost;
+  if (TreeReduce && Cost >= -SLPCostThreshold) {
+    std::vector<TreeEntry *> Vec;
+    SubTreeQueue SubTrees;
+    if (!findSubTree(SubTrees, Cost))
+      return Cost;
+    unsigned NodeCounter = 0;
+
+    for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+      TreeEntry *Entry = TEPtr.get();
+      Entry->dump();
+    }
+
+    while (!SubTrees.empty()) {
+      std::pair<int, std::vector<TreeEntry *>> SubTree = SubTrees.top();
+      SubTrees.pop();
+      NodeCounter++;
+
+      bool NoConnectToVecOp = true;
+      for (TreeEntry *T : SubTree.second) {
+        ProposedToGather.insert(T);
+        T->State = TreeEntry::NeedToGather;
+        // Zero cost on any gather operations with no direct
+        // connection to vector nodes. At this point those nodes stop
+        // being gather nodes
+        for (TreeEntry *U : T->UseEntries) {
+          for (const EdgeInfo &EI : U->UserTreeIndices) {
+            if (EI.UserTE->State != TreeEntry::NeedToGather) {
+              NoConnectToVecOp = false;
+              break;
+            }
+          }
+          if (!NoConnectToVecOp)
+            break;
+        }
+        if (NoConnectToVecOp)
+          for (TreeEntry *U : T->UseEntries) {
+            CostSum -= U->Cost;
+            U->Cost = 0;
+          }
+        CostSum -= T->Cost;
+        T->Cost = getEntryCost(T, VectorizedVals);
+        CostSum += T->Cost;
+
+        for (Value *V : T->Scalars) {
+          MustGather.insert(V);
+          ExternalUses.erase(
+              llvm::remove_if(ExternalUses,
+                              [V](ExternalUser &EU) { return EU.Scalar == V; }),
+              ExternalUses.end());
+        }
+        ExtractCost = getExtractShuffleCost(CostSum);
+        if (!NoCallInst)
+          SpillCost = getSpillCost();
+        assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost");
+        InstructionCost InsertCost = getInsertCost(VectorizedVals);
+        Cost = CostSum + ExtractCost + SpillCost + InsertCost;
+        for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+          TreeEntry *Entry = TEPtr.get();
+          Entry->dump();
+        }
+        if (Cost < -SLPCostThreshold && !isTreeTinyAndNotFullyVectorizable() &&
+            (VectorizableTree[0]->State != TreeEntry::NeedToGather &&
+             VectorizableTree[1]->State != TreeEntry::NeedToGather)) {
+          cutTree();
+          return Cost;
+        }
+      }
+    }
+    ProposedToGather.clear();
+  }
+  return FullCost;
 }
 
 Optional<TargetTransformInfo::ShuffleKind>
@@ -5666,12 +6051,24 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all Schedule Data from all nodes that we have changed
+    // vectorization decision.
+    if (!ProposedToGather.empty())
+      removeFromScheduling(BS);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
   auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
 
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if ((Entry->State != TreeEntry::NeedToGather) &&
+        !Entry->VectorizedValue)
+      vectorizeTree(Entry);
+  }
+
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
@@ -5813,7 +6210,9 @@
 
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
-      if (!Ty->isVoidTy()) {
+      // The tree might not be fully vectorized, so we don't have to
+      // check every user.
+      if (!Ty->isVoidTy() && ProposedToGather.empty()) {
         for (User *U : Scalar->users()) {
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
 
@@ -6042,6 +6441,7 @@
     BundleMember->FirstInBundle = BundleMember;
     ScheduleData *Next = BundleMember->NextInBundle;
     BundleMember->NextInBundle = nullptr;
+    BundleMember->TE = nullptr;
     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
     if (BundleMember->UnscheduledDepsInBundle == 0) {
       ReadyInsts.insert(BundleMember);
@@ -6311,6 +6711,85 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::BlockScheduling::reduceSchedulingRegion(Instruction *Start,
+                                                      Instruction *End) {
+  if (Start)
+    ScheduleStart = Start;
+  if (End)
+    ScheduleEnd = End;
+}
+
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS) {
+  bool Removed = false;
+  SmallPtrSet<Instruction *, 12> Gathers;
+  SmallPtrSet<Instruction *, 12> Reduced;
+  Instruction *Start = nullptr;
+
+  // We can reduce the number of instructions to be considered for scheduling,
+  // after cutting the tree. Here we shrink the scheduling area from the top,
+  // consecutively, untill we encounter the required instruction. There might be
+  // unnecessary NeedToGather nodes with the relationship only to other
+  // NeedToGather nodes and unmap instructions in chains, we could safely
+  // delete those.
+  for (std::unique_ptr<TreeEntry> &TEPtr : reverse(VectorizableTree)) {
+    TreeEntry *TE = TEPtr.get();
+    if (TE->State != TreeEntry::NeedToGather || !TE->getOpcode() ||
+        TE->getMainOp()->getParent() != BS->BB)
+      continue;
+    for (const EdgeInfo &EI : TE->UserTreeIndices) {
+      if (EI.UserTE && (EI.UserTE->State != TreeEntry::NeedToGather)) {
+        auto InstructionsOnly =
+            make_filter_range(TE->Scalars, Instruction::classof);
+        for (Value *V : InstructionsOnly)
+          Gathers.insert(cast<Instruction>(V));
+        break;
+      }
+    }
+  }
+
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (!getTreeEntry(I) && !Gathers.count(I)) {
+      Reduced.insert(I);
+    } else {
+      Start = I;
+      break;
+    }
+  }
+
+  BS->reduceSchedulingRegion(Start, nullptr);
+
+  for (TreeEntry *Entry : ProposedToGather) {
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      SD->IsScheduled = false;
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (!Removed)
+    return;
+
+  if (Reduced.size()) {
+    for (Instruction *I : Reduced) {
+      ScheduleData *SD = BS->getScheduleData(I);
+      if (SD)
+        SD->SchedulingRegionID = -1;
+    }
+  }
+  BS->resetSchedule();
+  BS->initialFillReadyList(BS->ReadyInsts);
+  for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+      continue;
+    BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -6885,7 +7364,7 @@
 
   R.computeMinimumValueSizes();
 
-  InstructionCost Cost = R.getTreeCost();
+  InstructionCost Cost = R.getTreeCost(true);
 
   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n");
   if (Cost < -SLPCostThreshold) {
@@ -7091,6 +7570,7 @@
   // Check that all of the parts are instructions of the same type,
   // we permit an alternate opcode via InstructionsState.
   InstructionsState S = getSameOpcode(VL);
+
   if (!S.getOpcode())
     return false;
 
@@ -7182,7 +7662,7 @@
         continue;
 
       R.computeMinimumValueSizes();
-      InstructionCost Cost = R.getTreeCost();
+      InstructionCost Cost = R.getTreeCost(true);
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
 
@@ -7852,7 +8332,7 @@
 
       // Estimate cost.
       InstructionCost TreeCost =
-          V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
+          V.getTreeCost(false, makeArrayRef(&ReducedVals[i], ReduxWidth));
       InstructionCost ReductionCost =
           getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
       InstructionCost Cost = TreeCost + ReductionCost;
Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,20 @@
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP3]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
 ; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
 ; CHECK-NEXT:    unreachable
 ;
 entry: