Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator.h"
@@ -118,8 +119,17 @@
                    "number "));

 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
-                   cl::desc("Attempt to vectorize horizontal reductions"));
+    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+                       cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable partial vectorization of the tree with throttling"));
+
+static cl::opt<unsigned>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(32), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));

 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -571,7 +581,48 @@
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost() const;
+  int getSpillCost();
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost();
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize
+  /// starting from seed instructions at the bottom, follows the chains of
+  /// dependencies up to the top of the SLP graph, and groups potentially
+  /// vectorizable operations in scalar form into bundles.
+  /// For example:
+  ///
+  ///           scalar form
+  ///                |
+  ///  scalar form     scalar form
+  ///            \       /
+  ///           scalar form
+  ///
+  /// The total cost is not profitable to vectorize, hence all operations stay
+  /// in scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///           vector form
+  ///                |
+  ///  vector form     scalar form
+  ///            \       /
+  ///           vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, while vectorizing the whole
+  /// tree is not.
+  /// More details: http://www.llvm.org/devmtg/2015-10/slides/Porpodas-ThrottlingAutomaticVectorization.pdf
+  bool findSubTree(int UserCost = 0);
+
+  /// Get the raw cost of all elements of the tree.
+  int getRawTreeCost();

   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
@@ -596,6 +647,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -603,6 +656,12 @@
       BS->clear();
     }
     MinBWs.clear();
+    ScalarsToVec.clear();
+    VecToScalars.clear();
+    VecInserts.clear();
+    NoCallInst = true;
+    RawTreeCost = 0;
+    IsCostSumReady = false;
   }

   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -667,6 +726,9 @@
   /// may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;

+  /// Try to cut the tree to make it partially vectorizable.
+  bool cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

   /// This structure holds any data we need about the edges being traversed
@@ -1448,7 +1510,7 @@
     Value *VectorizedValue = nullptr;

     /// Do we need to gather this sequence ?
-    enum EntryState { Vectorize, NeedToGather };
+    enum EntryState { Vectorize, NeedToGather, ProposedToGather };
     EntryState State;

     /// Does this sequence require some shuffling?
@@ -1457,6 +1519,9 @@
     /// Does this entry require reordering?
     ArrayRef ReorderIndices;

+    /// Cost of this tree entry.
+    int Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -1469,6 +1534,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;

+    /// The tree entries used by this entry (its operand entries).
+    TinyPtrVector UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
@@ -1599,6 +1667,9 @@
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
+      case ProposedToGather:
+        dbgs() << "ProposedToGather\n";
+        break;
       }
       dbgs() << "MainOp: ";
       if (MainOp)
@@ -1669,8 +1740,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }

-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last);
+    }

     return Last;
   }
@@ -1706,9 +1779,32 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap ScalarToTreeEntry;

+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
+  /// Tree values proposed to be vectorized.
+  ValueSet ScalarsToVec;
+
+  /// Tree values once considered to be vectorized, but later decided by
+  /// throttling to stay in scalar form.
+  ValueSet VecToScalars;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

+  /// Total cost of inserts in the tree for a particular value.
+  SmallDenseMap VecInserts;
+
+  /// Raw cost of all elements in the tree.
+  int RawTreeCost = 0;
+
+  /// Indicates that no CallInst was found in the tree, so we don't need to
+  /// calculate the spill cost.
+  bool NoCallInst = true;
+
+  /// True, if we have already calculated the cost of the tree.
+  bool IsCostSumReady = false;
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -1725,6 +1821,9 @@
   };
   using UserList = SmallVector;

+  /// \returns the cost of extracting a vectorized element for the given
+  /// external use.
+  int getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1775,6 +1874,12 @@
   /// after vectorization.
   UserList ExternalUses;

+  /// Current width of the bundles being vectorized.
+  unsigned BundleWidth = 0;
+
+  /// Internal uses of tree values that are proposed to be vectorized.
+  SmallDenseMap InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet EphValues;

@@ -2175,6 +2280,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector> BlocksSchedules;

+  /// Remove operations that are no longer going to be vectorized from the
+  /// scheduling.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2378,7 +2486,7 @@
   buildTree_rec(Roots, 0, EdgeInfo());

   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();

     // No need to handle users of gathered values.
@@ -2420,6 +2528,7 @@
           LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                             << ".\n");
           assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+          InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           continue;
         }
       }
@@ -3127,6 +3236,68 @@
   }
 }

+bool BoUpSLP::cutTree() {
+  SmallVector VecNodes;
+
+  // Estimate the subtree not only from a cost perspective, but also
+  // functionally: it must contain at least one real operation.
+  bool FoundRealOp = false;
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::Vectorize)
+      continue;
+    Instruction *Inst = Entry->getMainOp();
+    if (Inst && (isa(Inst) || isa(Inst) ||
+                 isa(Inst))) {
+      FoundRealOp = true;
+      break;
+    }
+  }
+  if (!FoundRealOp)
+    return false;
+
+  for (std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize)
+      VecNodes.push_back(Entry);
+  }
+  if (VecNodes.size() <= 2)
+    return false;
+  // Canceling unprofitable elements.
+  for (std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::NeedToGather)
+      continue;
+    if (Entry->State == TreeEntry::ProposedToGather) {
+      Entry->State = TreeEntry::NeedToGather;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+      }
+    }
+  }
+  // Values produced by the remaining vectorized operations may still be used
+  // by the canceled ones; to handle that, we have to populate the
+  // ExternalUses list with those uses.
+  for (TreeEntry *Entry : VecNodes)
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        if (!VecToScalars.count(U))
+          continue;
+        // Ignore users in the user ignore list.
+        auto *UserInst = cast(U);
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3283,7 +3454,7 @@
     ReuseShuffleCost =
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
   }
-  if (E->State == TreeEntry::NeedToGather) {
+  if (E->State != TreeEntry::Vectorize) {
     if (allConstant(VL))
       return 0;
     if (isSplat(VL)) {
@@ -3712,18 +3883,17 @@
   return true;
 }

-int BoUpSLP::getSpillCost() const {
+int BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   int Cost = 0;

   SmallPtrSet LiveValues;
   Instruction *PrevInst = nullptr;

-  for (const auto &TEPtr : VectorizableTree) {
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
     Instruction *Inst = dyn_cast(TEPtr->Scalars[0]);
     if (!Inst)
       continue;
@@ -3736,7 +3906,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa(&*J) && getTreeEntry(&*J))
+      if (isa(&*J) && ScalarsToVec.count(&*J))
         LiveValues.insert(cast(&*J));
     }

@@ -3764,11 +3934,11 @@
            !isa(&*PrevInstIt)) &&
           &*PrevInstIt != PrevInst)
         NumCalls++;
-
       ++PrevInstIt;
     }

     if (NumCalls) {
+      NoCallInst = false;
       SmallVector V;
       for (auto *II : LiveValues)
         V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -3781,15 +3951,132 @@
   return Cost;
 }

-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
+int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const {
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
+
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+  Value *ScalarRoot = VectorizableTree.front()->Scalars[0];
+
+  auto It = MinBWs.find(ScalarRoot);
+  if (It != MinBWs.end()) {
+    uint64_t Width = It->second.first;
+    bool Signed = It->second.second;
+    auto *MinTy = IntegerType::get(F->getContext(), Width);
+    unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt;
+    VecTy = VectorType::get(MinTy, BundleWidth);
+    return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy,
+                                          EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
+
+int BoUpSLP::getExtractCost() const {
+  int ExtractCost = 0;
+  SmallPtrSet ExtractCostCalculated;
+  // Consider the possibility of extracting vectorized
+  // values for use by canceled elements.
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::ProposedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      // Consider the possibility of extracting vectorized
+      // values for use by canceled elements.
+      auto It = InternalTreeUses.find(V);
+      if (It != InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+    }
+  }
+  for (const ExternalUser &EU : ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    int Cost = getExtractOperationCost(EU);
+    ExtractCost += Cost;
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost() {
+  int InsertCost = 0;
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    // Avoid already vectorized TreeEntries; they are already in vector form
+    // and we don't need to gather those operations.
+ if (Entry->State != TreeEntry::ProposedToGather) + continue; + bool NeedGather = false; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + for (User *Op : Inst->users()) + if (ScalarsToVec.count(Op)) { + NeedGather = true; + break; + } + } + if (NeedGather) + InsertCost += getEntryCost(Entry); + } + return InsertCost; +} + +bool BoUpSLP::findSubTree(int UserCost) { + SmallVector Vec; + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize || Entry->Cost <= 0 || !Entry->Idx) + continue; + Vec.push_back(Entry); + if (Vec.size() > MaxCostsRecalculations) + break; + } + llvm::sort(Vec, [&](const TreeEntry *LHS, const TreeEntry *RHS) { + return LHS->Cost > RHS->Cost; + }); + + for (TreeEntry *T : Vec) { + T->State = TreeEntry::ProposedToGather; + for (Value *V : T->Scalars) { + ScalarsToVec.erase(V); + VecToScalars.insert(V); + ScalarToTreeEntry.erase(V); + MustGather.insert(V); + ExternalUses.erase( + llvm::remove_if(ExternalUses, + [&V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + int PartialCost = getTreeCost() - UserCost; + RemovedOperations.push_back(T); + if (PartialCost < -SLPCostThreshold && cutTree()) { + LLVM_DEBUG( + dbgs() << "SLP: Decided to partially vectorize tree with cost: " + << PartialCost << ".\n"); + return true; + } + } + return false; +} + +int BoUpSLP::getRawTreeCost() { + int CostSum = 0; + BundleWidth = VectorizableTree.front()->Scalars.size(); LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); - - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -3803,68 +4090,80 @@ // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.State == TreeEntry::NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::ProposedToGather) + VecToScalars.insert(TE.Scalars.begin(), TE.Scalars.end()); + if (TE.State != TreeEntry::Vectorize && + llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->State != TreeEntry::Vectorize && + EntryPtr->isSame(TE.Scalars); + })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + if (TE.State == TreeEntry::Vectorize) + ScalarsToVec.insert(TE.Scalars.begin(), TE.Scalars.end()); + + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; + CostSum += TE.Cost; } - SmallPtrSet ExtractCostCalculated; - int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. 
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  if (SLPThrottling)
+    for (std::unique_ptr &TEPtr : VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State != TreeEntry::Vectorize)
+        continue;
+      int GatherCost = 0;
+      for (TreeEntry *Gather : TE->UseEntries)
+        if (Gather->State != TreeEntry::Vectorize)
+          GatherCost += Gather->Cost;
+      TE->Cost += GatherCost;
+    }
+  return CostSum;
+}

-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+int BoUpSLP::getTreeCost() {
+  int CostSum;
+  if (!IsCostSumReady) {
+    CostSum = getRawTreeCost();
+    RawTreeCost = CostSum;
+  } else {
+    CostSum = RawTreeCost;
+  }

-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+  if (SLPThrottling)
+    for (std::unique_ptr &TEPtr : VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State == TreeEntry::ProposedToGather)
+        CostSum -= TE->Cost;
     }
-  }

-  int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  int ExtractCost = getExtractCost();
+  int SpillCost = 0;
+  if (!NoCallInst || !IsCostSumReady)
+    SpillCost = getSpillCost();
+#ifndef NDEBUG
+  if (NoCallInst)
+    assert(getSpillCost() == 0 && "Incorrect spill cost");
+#endif
+  if (!IsCostSumReady)
+    IsCostSumReady = true;
+  int InsertCost = getInsertCost();
+  int Cost = CostSum + ExtractCost + SpillCost + InsertCost;

-  std::string Str;
-  {
-    raw_string_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
-  }
+#ifndef NDEBUG
+  SmallString<256> Str;
+  raw_svector_ostream OS(Str);
+  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+     << "SLP: Extract Cost = " << ExtractCost << ".\n"
+     << "SLP: Insert Cost = " << InsertCost << ".\n"
+     << "SLP: Total Cost = " << Cost << ".\n";
   LLVM_DEBUG(dbgs() << Str);
-
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
-
+#endif
   return Cost;
 }

@@ -4616,11 +4915,22 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all ScheduleData from the nodes for which we have changed the
+    // vectorization decision.
+ if (!RemovedOperations.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + auto *VectorRoot = vectorizeTree(VectorizableTree.front().get()); + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State == TreeEntry::Vectorize && !Entry->VectorizedValue) + vectorizeTree(Entry); + } // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We @@ -4629,7 +4939,7 @@ if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - auto BundleWidth = VectorizableTree[0]->Scalars.size(); + BundleWidth = VectorizableTree.front()->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); @@ -4729,7 +5039,7 @@ } // For each vectorized value: - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -4744,7 +5054,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && RemovedOperations.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5229,6 +5541,31 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (TreeEntry *Entry : RemovedOperations) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5754,6 +6091,9 @@ R.vectorizeTree(); return true; + } else { + if (SLPThrottling && R.findSubTree()) + R.vectorizeTree(); } return false; @@ -6006,6 +6346,9 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else { + if (SLPThrottling && R.findSubTree(UserCost)) + R.vectorizeTree(); } } } @@ -6791,15 +7134,16 @@ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { - V.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "HorSLPNotBeneficial", cast(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " - << ore::NV("Cost", Cost) << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); - }); + if (!SLPThrottling || !V.findSubTree(-ReductionCost)) break; + V.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + cast(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); 
} LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,34 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8 +; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16 ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; 
AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[DIV]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry:
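The findSubTree documentation in the first hunk describes the throttling idea only in the abstract, so here is a minimal, self-contained sketch of the same greedy strategy on a toy cost model (illustration only, not LLVM code; the ToyNode type, the costs, and the threshold are made up for the example): collect the nodes whose individual cost is positive, propose the most expensive ones for gathering one at a time, and stop as soon as the remaining partial tree clears the cost threshold.

#include <algorithm>
#include <iostream>
#include <vector>

struct ToyNode {
  int Cost;              // Positive cost: unprofitable to vectorize on its own.
  bool Proposed = false; // Proposed to stay scalar (gathered).
};

// Cost of the tree if every node that is not proposed-to-gather is vectorized.
static int treeCost(const std::vector<ToyNode> &Tree) {
  int Sum = 0;
  for (const ToyNode &N : Tree)
    if (!N.Proposed)
      Sum += N.Cost;
  return Sum;
}

int main() {
  const int CostThreshold = 0; // Vectorize only when the total cost is negative.
  std::vector<ToyNode> Tree = {{-4}, {-3}, {6}, {2}}; // Whole-tree cost is +1.
  std::cout << "whole-tree cost: " << treeCost(Tree) << "\n";

  // Candidates are the nodes with a positive cost, most expensive first.
  std::vector<ToyNode *> Candidates;
  for (ToyNode &N : Tree)
    if (N.Cost > 0)
      Candidates.push_back(&N);
  std::sort(Candidates.begin(), Candidates.end(),
            [](const ToyNode *L, const ToyNode *R) { return L->Cost > R->Cost; });

  // Propose candidates for gathering one by one and stop as soon as the
  // remaining partial tree becomes profitable.
  for (ToyNode *N : Candidates) {
    N->Proposed = true;
    int PartialCost = treeCost(Tree);
    std::cout << "partial cost after throttling one node: " << PartialCost << "\n";
    if (PartialCost < CostThreshold) {
      std::cout << "partially vectorizing the tree is profitable\n";
      break;
    }
  }
  return 0;
}

In the real patch the recalculation is bounded by the slp-throttling-budget option and the canceled entries are turned into gathers (ProposedToGather, then cutTree), but the profitability loop follows the same shape as this sketch.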