Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator.h"
@@ -117,8 +118,17 @@
                               "number "));

 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
-                   cl::desc("Attempt to vectorize horizontal reductions"));
+    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+                       cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable tree partial vectorize with throttling"));
+
+static cl::opt<int>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(128), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));

 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -556,9 +566,55 @@
   /// holding live values over call sites.
   int getSpillCost() const;

+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost() const;
+
+  /// Cut the given path until it might become profitable to vectorize.
+  bool cutPath(int &Cost, ArrayRef<TreeEntry *> Path);
+
+  /// Find a non-gathering leaf node from the current node \p C and record the
+  /// path on the way.
+  TreeEntry *findLeaf(TreeEntry *C, SmallVectorImpl<TreeEntry *> &Path) const;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize
+  /// starting from seed instructions at the bottom and follows chains of
+  /// dependencies up to the top of the SLP graph, grouping potentially
+  /// vectorizable operations in scalar form into bundles.
+  /// For example:
+  ///
+  ///            scalar form
+  ///                 |
+  ///   scalar form      scalar form
+  ///            \          /
+  ///            scalar form
+  ///
+  /// The total cost is not profitable to vectorize, hence all operations stay
+  /// in scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///            vector form
+  ///                 |
+  ///   vector form      scalar form
+  ///            \          /
+  ///            vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, even though vectorizing the
+  /// whole tree is not profitable.
+  /// More details: http://www.llvm.org/devmtg/2015-10/slides/Porpodas-ThrottlingAutomaticVectorization.pdf
+  Optional<int> findSubTree(int Cost);
+
   /// \returns the vectorization cost of the subtree that starts at \p VL.
-  /// A negative number means that this is profitable.
-  int getTreeCost();
+  /// A negative number means that this is profitable. The second member is the
+  /// raw cost sum of the subtree, without the extract cost, spill cost, etc.
+  std::pair<int, int> getTreeCost();

   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -579,6 +635,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -586,6 +644,9 @@
       BS->clear();
     }
     MinBWs.clear();
+    ScalarsToVec.clear();
+    VecToScalars.clear();
+    CostsRecalculations = 0;
   }

   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -641,6 +702,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;

+  /// Check that the subtree is functionally worth vectorizing, not just by cost.
+  bool isGoodSubTreeToVectorize() const;
+
   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
   /// can be load combined in the backend. Load combining may not be allowed in
   /// the IR optimizer, so we do not want to alter the pattern. For example,
@@ -650,6 +714,9 @@
   /// may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;

+  /// Try to cut the tree to make it partially vectorizable.
+  bool cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

   /// This structure holds any data we need about the edges being traversed
@@ -1432,7 +1499,8 @@
     Value *VectorizedValue = nullptr;

     /// Do we need to gather this sequence ?
-    bool NeedToGather = false;
+    enum EntryState { Vectorize, NeedToGather, ProposedToGather };
+    EntryState State;

     /// Does this sequence require some shuffling?
     SmallVector ReuseShuffleIndices;
@@ -1440,6 +1508,9 @@
     /// Does this entry require reordering?
     ArrayRef ReorderIndices;

+    /// Cost of this tree entry.
+    int Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -1452,6 +1523,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;

+    /// The tree nodes that use this entry.
+    TinyPtrVector<TreeEntry *> UseTreeIndices;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;

@@ -1562,6 +1636,13 @@
       return true;
     }

+    // Returns true if this node has more than one vectorizable use.
+    bool isBranch() const {
+      return llvm::count_if(UseTreeIndices, [this](TreeEntry *Next) {
+               return (Next->Idx != Idx && Next->State == TreeEntry::Vectorize);
+             }) > 1;
+    }
+
 #ifndef NDEBUG
     /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
@@ -1574,7 +1655,18 @@
       dbgs() << "Scalars: \n";
       for (Value *V : Scalars)
         dbgs().indent(2) << *V << "\n";
-      dbgs() << "NeedToGather: " << NeedToGather << "\n";
+      dbgs() << "State: ";
+      switch (State) {
+      case Vectorize:
+        dbgs() << "Vectorize\n";
+        break;
+      case NeedToGather:
+        dbgs() << "NeedToGather\n";
+        break;
+      case ProposedToGather:
+        dbgs() << "ProposedToGather\n";
+        break;
+      }
       dbgs() << "MainOp: ";
       if (MainOp)
         dbgs() << *MainOp << "\n";
@@ -1594,12 +1686,12 @@
       if (ReuseShuffleIndices.empty())
         dbgs() << "Emtpy";
       else
-        for (unsigned ReuseIdx : ReuseShuffleIndices)
-          dbgs() << ReuseIdx << ", ";
+        for (unsigned Idx : ReuseShuffleIndices)
+          dbgs() << Idx << ", ";
       dbgs() << "\n";
       dbgs() << "ReorderIndices: ";
-      for (unsigned ReorderIdx : ReorderIndices)
-        dbgs() << ReorderIdx << ", ";
+      for (unsigned Idx : ReorderIndices)
+        dbgs() << Idx << ", ";
       dbgs() << "\n";
       dbgs() << "UserTreeIndices: ";
       for (const auto &EInfo : UserTreeIndices)
@@ -1620,7 +1712,7 @@
     TreeEntry *Last = VectorizableTree.back().get();
     Last->Idx = VectorizableTree.size() - 1;
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
-    Last->NeedToGather = !Vectorized;
+    Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
@@ -1644,8 +1736,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }

-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseTreeIndices.push_back(Last);
+    }

     return Last;
   }
@@ -1681,6 +1775,16 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;

+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
+  /// Tree values proposed to be vectorized.
+  ValueSet ScalarsToVec;
+
+  /// Tree values once considered for vectorization, but later decided by
+  /// throttling to stay in scalar form.
+  ValueSet VecToScalars;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

@@ -1700,6 +1804,9 @@
   };
   using UserList = SmallVector;

+  /// \returns the cost of extracting the vectorized element for use \p EU.
+  int getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1750,6 +1857,13 @@
   /// after vectorization.
   UserList ExternalUses;

+  /// Number of nodes for which we have already recalculated the cost of
+  /// the subtree during throttling.
+  int CostsRecalculations = 0;
+
+  /// Internal (in-tree) uses of values proposed to be vectorized.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet EphValues;

@@ -2136,6 +2250,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

+  /// Remove canceled operations from the list proposed for scheduling.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2291,7 +2408,7 @@

   static std::string getNodeAttributes(const TreeEntry *Entry,
                                        const BoUpSLP *) {
-    if (Entry->NeedToGather)
+    if (Entry->State == TreeEntry::NeedToGather)
       return "color=red";
     return "";
   }
@@ -2339,11 +2456,11 @@
   buildTree_rec(Roots, 0, EdgeInfo());

   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();

     // No need to handle users of gathered values.
-    if (Entry->NeedToGather)
+    if (Entry->State == TreeEntry::NeedToGather)
       continue;

     // For each lane:
@@ -2380,7 +2497,8 @@
             !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
           LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                             << ".\n");
-          assert(!UseEntry->NeedToGather && "Bad state");
+          assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+          InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           continue;
         }
       }
@@ -3088,6 +3206,64 @@
   }
 }

+bool BoUpSLP::cutTree() {
+  SmallPtrSet Removed;
+  SmallVector VecNodes;
+  if (!isGoodSubTreeToVectorize())
+    return false;
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize)
+      VecNodes.push_back(Entry);
+  }
+  if (VecNodes.size() <= 2)
+    return false;
+  // Cancel unprofitable elements.
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::NeedToGather)
+      continue;
+    if (Entry->State == TreeEntry::ProposedToGather) {
+      Entry->State = TreeEntry::NeedToGather;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+        ScalarToTreeEntry.erase(V);
+        Removed.insert(V);
+        RemovedOperations.push_back(Entry);
+        MustGather.insert(V);
+        ExternalUses.erase(
+            llvm::remove_if(ExternalUses,
+                            [&V](ExternalUser &EU) { return EU.Scalar == V; }),
+            ExternalUses.end());
+      }
+    }
+  }
+  // For all canceled operations we should consider the possibility of their
+  // use by non-canceled operations; for that, we need to populate the
+  // ExternalUses list with the canceled elements.
+  for (TreeEntry *Entry : VecNodes)
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        auto *UserInst = dyn_cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+        if (!Removed.count(U))
+          continue;
+        // Ignore users in the user ignore list.
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation:" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3212,7 +3388,7 @@
     ReuseShuffleCost =
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
   }
-  if (E->NeedToGather) {
+  if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
       return 0;
     if (isSplat(VL)) {
@@ -3280,7 +3456,7 @@
           TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
       }
     }
-    if (!E->NeedToGather) {
+    if (E->State == TreeEntry::Vectorize) {
       int DeadCost = ReuseShuffleCost;
       if (!E->ReorderIndices.empty()) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
@@ -3566,20 +3742,22 @@ << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; @@ -3642,6 +3820,19 @@ return true; } +bool BoUpSLP::isGoodSubTreeToVectorize() const { + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize) + continue; + Instruction *Inst = Entry->getMainOp(); + if (Inst && (isa(Inst) || isa(Inst) || + isa(Inst))) + return true; + } + return false; +} + int BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, @@ -3653,7 +3844,7 @@ SmallPtrSet LiveValues; Instruction *PrevInst = nullptr; - for (const auto &TEPtr : VectorizableTree) { + for (const std::unique_ptr &TEPtr : VectorizableTree) { Instruction *Inst = dyn_cast(TEPtr->Scalars[0]); if (!Inst) continue; @@ -3666,7 +3857,7 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && ScalarsToVec.count(&*J)) LiveValues.insert(cast(&*J)); } @@ -3711,15 +3902,201 @@ return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; +int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); + + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + Value *ScalarRoot = VectorizableTree[0]->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt; + VecTy = VectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +int BoUpSLP::getExtractCost() const { + int ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + for (const ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // Avoid non-vectorized scalars. 
+    if (!ScalarsToVec.count(EU.Scalar)) {
+      // Consider the possibility of extracting vectorized
+      // values for use by the canceled elements.
+      auto It = InternalTreeUses.find(EU.Scalar);
+      if (It != InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+      continue;
+    }
+    ExtractCost += getExtractOperationCost(EU);
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost() const {
+  int InsertCost = 0;
+  for (const std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    // Avoid already vectorized TreeEntries; they are already in vector form
+    // and we don't need to gather those operations.
+    if (Entry->State == TreeEntry::NeedToGather || Entry->VectorizedValue)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      for (Use &U : Inst->operands()) {
+        Value *Op = U.get();
+        if (VecToScalars.count(Op))
+          InsertCost += getGatherCost(Op);
+      }
+    }
+  }
+  return InsertCost;
+}
+
+bool BoUpSLP::cutPath(int &Cost, ArrayRef<TreeEntry *> Path) {
+  // Cancel nodes one by one until Path is exhausted or we find a suitable set
+  // of nodes for partial tree vectorization.
+  for (TreeEntry *N : Path) {
+    CostsRecalculations++;
+
+    // Stop if we are over our budget of cost recalculations.
+    if (CostsRecalculations >= MaxCostsRecalculations)
+      break;
+
+    // We no longer propose to vectorize this node, and we subtract its cost
+    // from the cost of all vectorizable nodes.
+    assert(N->State == TreeEntry::Vectorize &&
+           "Incorrect node state, visiting twice.");
+    N->State = TreeEntry::ProposedToGather;
+    Cost -= N->Cost;
+    for (Value *V : N->Scalars) {
+      ScalarsToVec.erase(V);
+      VecToScalars.insert(V);
+    }
+    int PartialCost = Cost;
+    PartialCost += getExtractCost() + getSpillCost() + getInsertCost();
+    if (PartialCost < -SLPCostThreshold && cutTree())
+      return true;
+  }
+  return false;
+}
+
+BoUpSLP::TreeEntry *
+BoUpSLP::findLeaf(TreeEntry *C, SmallVectorImpl<TreeEntry *> &Path) const {
+  int NonGatherUse;
+  if (!is_contained(Path, C))
+    Path.push_back(C);
+  do {
+    NonGatherUse = 0;
+    for (TreeEntry *Next : llvm::reverse(C->UseTreeIndices)) {
+      if (Next->State == TreeEntry::NeedToGather)
+        continue;
+      // Ignore any processed nodes to avoid cycles.
+      if (Next->State == TreeEntry::ProposedToGather ||
+          is_contained(Path, Next) || Next == C)
+        continue;
+      C = Next;
+      Path.push_back(C);
+      NonGatherUse++;
+      break;
+    }
+  } while (NonGatherUse != 0);
+  return C;
+}
+
+Optional<int> BoUpSLP::findSubTree(int Cost) {
+  SmallVector Path;
+  SmallVector SubPath;
+  TreeEntry *Node = nullptr;
+
+  // To start, we find just one leaf node that happens not to be the root node
+  // of the graph, i.e. one with a non-zero index. Then Path is the route from
+  // the root node to our leaf node.
+  if (!findLeaf(VectorizableTree[0].get(), Path)->Idx)
+    return None;
+  do {
+    Node = Path.back();
+    assert(Node->State == TreeEntry::Vectorize && "Incorrect node state");
+    // If we found a branch node, i.e. a node with more than one non-gathering
+    // child, we could try to find a set of profitable nodes in SubPath to
+    // vectorize, and if there is no such set of profitable nodes then we could
+    // consider another leaf that is reachable from this branch node.
+    if (Node->isBranch()) {
+      if (cutPath(Cost, SubPath))
+        return Cost;
+      if (CostsRecalculations >= MaxCostsRecalculations) {
+        SubPath.clear();
+        break;
+      }
+      TreeEntry *NextFromBranch = nullptr;
+      auto It = llvm::find_if(
+          llvm::reverse(Node->UseTreeIndices), [&Node, &Path](TreeEntry *E) {
+            return (E != Node && E->State == TreeEntry::Vectorize &&
+                    !is_contained(Path, E));
+          });
+      if (It != Node->UseTreeIndices.rend())
+        NextFromBranch = *It;
+      SubPath.clear();
+      if (NextFromBranch && NextFromBranch != Node)
+        Node = findLeaf(NextFromBranch, Path);
+    } else {
+      // If this node is not a branch node then we can move to another node
+      // below until we reach the root node of the graph or encounter another
+      // branch node.
+      SubPath.push_back(Node);
+      Path.pop_back();
+    }
+  } while (Node->Idx);
+
+  // There are no branches left now, so reduce the single remaining path.
+  if (!SubPath.empty()) {
+    if (cutPath(Cost, SubPath))
+      return Cost;
+  }
+
+#ifndef NDEBUG
+  // Make sure that we have processed all nodes.
+  if (CostsRecalculations < MaxCostsRecalculations)
+    for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+      TreeEntry *Entry = TEPtr.get();
+      if (Entry->State == TreeEntry::NeedToGather)
+        continue;
+      assert(Entry->State == TreeEntry::ProposedToGather &&
+             "Incorrect node state");
+    }
+#endif
+  return None;
+}
+
+std::pair<int, int> BoUpSLP::getTreeCost() {
+  int CostSum = 0;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");

-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();

     // We create duplicate tree entries for gather sequences that have multiple
     // uses. However, we should not compute the cost of duplicate sequences.
@@ -3733,68 +4110,50 @@
     // their uses. Since such an approach results in fewer total entries,
     // existing heuristics based on tree size may yield different results.
     //
-    if (TE.NeedToGather &&
-        std::any_of(
-            std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(),
-            [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
-              return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars);
-            }))
+    if (TE.State == TreeEntry::NeedToGather &&
+        llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1),
+                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                       return EntryPtr->State == TreeEntry::NeedToGather &&
+                              EntryPtr->isSame(TE.Scalars);
+                     }))
       continue;

-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    if (TE.State == TreeEntry::Vectorize)
+      ScalarsToVec.insert(TE.Scalars.begin(), TE.Scalars.end());
+
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
+    CostSum += TE.Cost;
   }

-  SmallPtrSet ExtractCostCalculated;
-  int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
-
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
-
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type.
Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + if (SLPThrottling) + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State == TreeEntry::NeedToGather) + continue; + int GatherCost = 0; + for (TreeEntry *Gather : TE->UseTreeIndices) + if (Gather->State == TreeEntry::NeedToGather) + GatherCost += Gather->Cost; + TE->Cost = TE->Cost + GatherCost; } - } + int ExtractCost = getExtractCost(); int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + int Cost = CostSum + ExtractCost + SpillCost; - std::string Str; - { - raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; - } + SmallString<256> Str; + raw_svector_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); - return Cost; + return std::make_pair(Cost, CostSum); } int BoUpSLP::getGatherCost(Type *Ty, @@ -4029,7 +4388,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -4084,7 +4443,7 @@ } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -4117,7 +4476,7 @@ return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -4559,7 +4918,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. 
+ if (!RemovedOperations.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -4603,7 +4967,7 @@ continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4672,11 +5036,11 @@ } // For each vectorized value: - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -4687,7 +5051,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && RemovedOperations.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5172,6 +5538,31 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (TreeEntry *Entry : RemovedOperations) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5679,7 +6070,9 @@ R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + std::pair C = R.getTreeCost(); + int Cost = C.first; + int CostSum = C.second; LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -5695,6 +6088,15 @@ R.vectorizeTree(); return true; + } else { + if (SLPThrottling) { + Optional PartialCost = R.findSubTree(CostSum); + if (PartialCost.hasValue() && + PartialCost.getValue() < -SLPCostThreshold) { + R.vectorizeTree(); + return true; + } + } } return false; @@ -5928,23 +6330,34 @@ continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost() - UserCost; + std::pair C = R.getTreeCost(); + int Cost = C.first - UserCost; + int CostSum = C.second; CandidateFound = true; MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", - cast(Ops[0])) - << "SLP vectorized with cost " << ore::NV("Cost", Cost) - << " and with tree size " - << ore::NV("TreeSize", R.getTreeSize())); + cast(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); // Move to the next bundle. 
I += VF - 1; NextInst = I + 1; Changed = true; + } else { + if (SLPThrottling) { + Optional PartialCost = R.findSubTree(CostSum); + if (PartialCost.hasValue() && + PartialCost.getValue() < -SLPCostThreshold) { + R.vectorizeTree(); + Changed = true; + } + } } } } @@ -6726,7 +7139,7 @@ V.computeMinimumValueSizes(); // Estimate cost. - int TreeCost = V.getTreeCost(); + int TreeCost = V.getTreeCost().first; int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -34,21 +34,21 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 -; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 -; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 -; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 -; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 -; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]] +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -24,53 +24,59 @@ ; CHECK-NEXT: [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32 -; 
CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[CONV]], -128 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP5]] to i32 -; CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128 -; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[CONV3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], ; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 128, [[CONV]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]] -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[TMP8]], ; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]] -; CHECK-NEXT: [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[SUB12]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[SUB7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP8]], <2 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP15]] to i32 ; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] ; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 ; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 ; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP16]], 0 ; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 ; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 ; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128 -; CHECK-NEXT: [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP18]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[CONV3_1]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[CONV_1]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = add nsw <2 x i32> [[TMP20]], ; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]] -; CHECK-NEXT: 
[[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]] -; CHECK-NEXT: [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1 +; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[TMP21]], ; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]] -; CHECK-NEXT: [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> undef, i32 [[SUB12_1]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[SUB7_1]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP22]], <2 x i32> [[TMP21]], <2 x i32> [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP28]] to i32 ; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] ; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 ; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP29]], 0 ; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 ; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 ; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] Index: llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -12,39 +12,45 @@ ; SSE: for.body: ; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S1_055:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I40:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S0_054:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I44:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; SSE-NEXT: [[ADD:%.*]] = fadd float [[S0_054]], [[TMP0]] -; SSE-NEXT: [[ADD3:%.*]] = fadd float [[S1_055]], [[TMP0]] -; SSE-NEXT: [[MUL:%.*]] = fmul float [[S0_054]], 0.000000e+00 -; SSE-NEXT: 
[[ADD4:%.*]] = fadd float [[MUL]], [[ADD3]] -; SSE-NEXT: [[MUL5:%.*]] = fmul float [[S1_055]], 0.000000e+00 -; SSE-NEXT: [[ADD6:%.*]] = fadd float [[MUL5]], [[ADD]] -; SSE-NEXT: [[CMP_I:%.*]] = fcmp olt float [[ADD6]], 1.000000e+00 -; SSE-NEXT: [[COND_I:%.*]] = select i1 [[CMP_I]], float [[ADD6]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I51:%.*]] = fcmp olt float [[COND_I]], -1.000000e+00 -; SSE-NEXT: [[CMP_I49:%.*]] = fcmp olt float [[ADD4]], 1.000000e+00 -; SSE-NEXT: [[COND_I50:%.*]] = select i1 [[CMP_I49]], float [[ADD4]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I47:%.*]] = fcmp olt float [[COND_I50]], -1.000000e+00 -; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[COND_I]], 0.000000e+00 -; SSE-NEXT: [[MUL10:%.*]] = select i1 [[CMP_I51]], float -0.000000e+00, float [[COND_I_OP]] -; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[COND_I50]], 0.000000e+00 -; SSE-NEXT: [[MUL11:%.*]] = select i1 [[CMP_I47]], float -0.000000e+00, float [[COND_I50_OP]] -; SSE-NEXT: [[ADD13]] = fadd float [[MUL10]], [[MUL11]] +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[TMP7]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP6]], i32 1 +; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP5]], [[TMP9]] +; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], +; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> +; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> undef, float [[TMP16]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP15]], i32 1 +; SSE-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP18]] +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP19]], i32 0 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[TMP19]], i32 1 +; SSE-NEXT: [[ADD13]] = fadd float [[TMP20]], [[TMP21]] ; SSE-NEXT: [[CMP_I45:%.*]] = fcmp olt float [[ADD13]], 1.000000e+00 ; SSE-NEXT: [[COND_I46:%.*]] = select i1 [[CMP_I45]], float [[ADD13]], float 1.000000e+00 ; SSE-NEXT: [[CMP_I43:%.*]] = fcmp olt float [[COND_I46]], -1.000000e+00 -; SSE-NEXT: [[COND_I44]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] -; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[MUL11]], 1.000000e+00 -; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[MUL11]], float 1.000000e+00 +; SSE-NEXT: [[COND_I44:%.*]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] +; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[TMP21]], 1.000000e+00 +; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[TMP21]], float 1.000000e+00 ; SSE-NEXT: [[CMP_I39:%.*]] = fcmp olt float [[COND_I42]], -1.000000e+00 -; SSE-NEXT: [[COND_I40]] = select 
i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] +; SSE-NEXT: [[COND_I40:%.*]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] ; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <2 x float> undef, float [[COND_I40]], i32 0 +; SSE-NEXT: [[TMP23]] = insertelement <2 x float> [[TMP22]], float [[COND_I44]], i32 1 ; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; SSE: for.end: ; SSE-NEXT: ret void Index: llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -15,18 +15,20 @@ ; CHECK: for.body6: ; CHECK-NEXT: br label [[FOR_BODY12:%.*]] ; CHECK: for.body12: -; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ] -; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]] -; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]] -; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[FOR_BODY6]] ], [ [[TMP7:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double [[ADD15]], 4.000000e+00 ; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]] -; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] -; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[TMP4]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL18]], i32 1 +; CHECK-NEXT: [[TMP7]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] ; CHECK: for.inc21: ; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -67,13 +67,14 @@ ; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] 
= bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* [[TMP17]], align 1 ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry:
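
For readers skimming the patch: the updated cost-query call sites in SLPVectorizer.cpp above all follow the same caller-side pattern, condensed in the sketch below (the horizontal-reduction site only uses the first member of the pair). This is an illustrative summary, not literal patch text; all names are taken from the patch itself.

  // getTreeCost() now returns {total cost, raw per-node cost sum}.
  std::pair<int, int> C = R.getTreeCost();
  int Cost = C.first;     // includes spill and extract costs
  int CostSum = C.second; // raw sum, the starting point for throttling
  if (Cost < -SLPCostThreshold) {
    R.vectorizeTree();    // the whole tree is profitable
  } else if (SLPThrottling) {
    // Otherwise try to find a profitable subtree; findSubTree() returns the
    // partial cost if the throttled tree is still worth vectorizing.
    Optional<int> PartialCost = R.findSubTree(CostSum);
    if (PartialCost.hasValue() && PartialCost.getValue() < -SLPCostThreshold)
      R.vectorizeTree();
  }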