Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -114,6 +114,15 @@
 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
+static cl::opt<bool>
+SLPThrottling("slp-throttling", cl::init(true), cl::Hidden,
+              cl::desc("Enable partial vectorization of the tree with throttling"));
+
+static cl::opt<int>
+MaxCostsRecalculations("slp-throttling-budget", cl::init(500), cl::Hidden,
+                       cl::desc("Limit the total number of nodes for cost "
+                                "recalculations during throttling"));
+
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
     cl::desc(
@@ -524,11 +533,35 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost() const;
+  int getSpillCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) const;
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost(const SmallPtrSetImpl<Value *> &ScalarsToVec);
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost(const SmallPtrSetImpl<Value *> &VecToScalars);
+
+  /// Determine the cost of partial vectorization.
+  int getSubTreeCost(ArrayRef<unsigned> VecNodes);
+
+  /// Does a BFS over the non-gathering nodes of the SLP graph.
+  void BFS();
+
+  /// Find the subgraph of the graph that is profitable to vectorize.
+  int findSubTree();
+
+  /// Cut the branch until it is profitable to vectorize.
+  unsigned cutBranch(unsigned LeafIdx, unsigned L,
+                     ArrayRef<unsigned> VecNodes, int &Cost);
+
+  /// Try to extend the subgraph that makes sense to vectorize from
+  /// a cost perspective.
+  int extendSubTree(unsigned LeafIdx, ArrayRef<unsigned> VecNodes);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost();
+  int getTreeCost(bool cutTree = false);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -549,6 +582,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -556,6 +591,11 @@
       BS->clear();
     }
     MinBWs.clear();
+    UseTreeIndices.clear();
+    Nodes.clear();
+    NodesList.clear();
+    EntriesCosts.clear();
+    CostsRecalculations = 0;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -599,6 +639,9 @@
     return MinVecRegSize;
   }
 
+  /// Save seed instructions to try to vectorize them partially later.
+  void recordSeeds(ArrayRef<Value *> Ops);
+
   /// Check if ArrayType or StructType is isomorphic to some VectorType.
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
@@ -608,6 +651,12 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree(ArrayRef<unsigned> VecNodes);
+
+  /// Try to partially vectorize the tree via throttling.
+  bool tryPartialVectorization();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1315,6 +1364,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector<unsigned, 4> RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
@@ -1334,6 +1386,25 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  struct Node {
+    Node(unsigned I) : Idx(I) {}
+
+    /// Index of this node in the graph.
+    unsigned Idx = 0;
+
+    /// A previous node along the way to this node.
+    unsigned Prev = 0;
+
+    /// Color of this node.
+    unsigned Color = 0;
+
+    /// True if this node is a leaf in the graph.
+    bool IsLeaf = false;
+  };
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractOperationCost(ExternalUser &EU);
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1385,6 +1456,26 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// The array of non-gathering nodes of the graph.
+  SmallVector<std::unique_ptr<Node>, 8> NodesList;
+
+  /// The array of costs for each tree entry.
+  SmallVector<int, 8> EntriesCosts;
+
+  /// Map from a tree entry index to its non-gathering node.
+  DenseMap<unsigned, Node *> Nodes;
+
+  /// List of all seed instructions that were not vectorized; we will try
+  /// to vectorize them later with partial vectorization.
+  SmallVector<std::unique_ptr<SmallVector<Value *, 8>>, 8> Seeds;
+
+  /// Number of tree nodes for which we have already recalculated the cost
+  /// of the subgraph during throttling.
+  int CostsRecalculations = 0;
+
+  /// For each in-tree user, the in-tree values that it uses.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -1394,6 +1485,9 @@
   /// A list of blocks that we are going to CSE.
   SetVector<BasicBlock *> CSEBlocks;
 
+  /// For each tree entry index, the indices of the entries that it uses.
+  SmallDenseMap<unsigned, SmallVector<unsigned, 4>> UseTreeIndices;
+
   /// Contains all scheduling relevant data for an instruction.
   /// A ScheduleData either represents a single instruction or a member of an
   /// instruction bundle (= a group of instructions which is combined into a
@@ -1743,6 +1837,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 
+  /// Remove operations from the list of operations proposed to be scheduled.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -1948,6 +2045,7 @@
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                               << ".\n");
             assert(!UseEntry->NeedToGather && "Bad state");
+            InternalTreeUses[U].push_back(ExternalUser(Scalar, U, FoundLane));
             continue;
           }
         }
@@ -2566,6 +2664,91 @@
   }
 }
 
+void BoUpSLP::cutTree(ArrayRef<unsigned> VecNodes) {
+  SmallPtrSet<Value *, 4> Removed;
+  // Canceling unprofitable elements.
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (!is_contained(VecNodes, I)) {
+      Entry->NeedToGather = true;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+        ScalarToTreeEntry.erase(V);
+        Removed.insert(V);
+        RemovedOperations.push_back(I);
+        MustGather.insert(V);
+        ExternalUses.erase(
+            std::remove_if(ExternalUses.begin(), ExternalUses.end(),
+                           [&](ExternalUser &EU) { return EU.Scalar == V; }),
+            ExternalUses.end());
+      }
+    }
+  }
+  // Canceled operations may still be used by the remaining vectorized
+  // operations, so the ExternalUses list has to be populated with the
+  // canceled elements as well.
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (is_contained(VecNodes, I)) {
+      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+        Value *Scalar = Entry->Scalars[Lane];
+        for (User *U : Scalar->users()) {
+          LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+          Instruction *UserInst = dyn_cast<Instruction>(U);
+          if (!UserInst)
+            continue;
+          if (!Removed.count(U))
+            continue;
+          // Ignore users in the user ignore list.
+          if (is_contained(UserIgnoreList, UserInst))
+            continue;
+          LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation: "
+                            << *U << " from lane " << Lane
+                            << " from " << *Scalar << ".\n");
+          ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+        }
+      }
+    }
+  }
+}
+
+bool BoUpSLP::tryPartialVectorization() {
+  bool Changed = false;
+  for (unsigned I = 0; I < Seeds.size(); ++I) {
+    auto &S = *Seeds[I].get();
+    // Check that these seed instructions are still alive.
+    if (std::any_of(S.begin(), S.end(), [](Value *V) {
+          return (!(cast<Instruction>(V))->getOperand(0));
+        }))
+      continue;
+
+    buildTree(S);
+
+    // If other parts of the BB were already vectorized, the remaining
+    // tree might not be interesting enough to look at.
+    if (isTreeTinyAndNotFullyVectorizable())
+      continue;
+
+    int Cost = getTreeCost(true);
+    if (Cost < -SLPCostThreshold) {
+      vectorizeTree();
+      Changed = true;
+    }
+  }
+  Seeds.clear();
+  return Changed;
+}
+
+void BoUpSLP::recordSeeds(ArrayRef<Value *> Ops) {
+  Seeds.push_back(
+      llvm::make_unique<SmallVector<Value *, 8>>(Ops.begin(), Ops.end()));
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N;
   Type *EltTy;
@@ -3073,7 +3256,7 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() const {
+int BoUpSLP::getSpillCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) const {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
@@ -3097,7 +3280,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+      if (isa<Instruction>(&*J) && ScalarsToVec.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
     }
 
@@ -3138,15 +3321,302 @@
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
+int BoUpSLP::getExtractOperationCost(ExternalUser &EU) {
+  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
+
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+
+  if (MinBWs.count(ScalarRoot)) {
+    auto *MinTy = IntegerType::get(F->getContext(),
+                                   MinBWs[ScalarRoot].first);
+    auto Extend =
+        MinBWs[ScalarRoot].second ?
+            Instruction::SExt : Instruction::ZExt;
+    VecTy = VectorType::get(MinTy, BundleWidth);
+    return (TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                          VecTy, EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
+
+int BoUpSLP::getExtractCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) {
+  int ExtractCost = 0;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
+  for (ExternalUser &EU : ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    // Avoid non-vectorized scalars.
+    if (!ScalarsToVec.count(EU.Scalar)) {
+      // Consider the possibility of extracting vectorized
+      // values for use by the canceled elements.
+      if (InternalTreeUses.find(EU.Scalar) != InternalTreeUses.end())
+        for (ExternalUser &IU : InternalTreeUses[EU.Scalar])
+          ExtractCost += getExtractOperationCost(IU);
+      continue;
+    }
+
+    ExtractCost += getExtractOperationCost(EU);
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost(const SmallPtrSetImpl<Value *> &VecToScalars) {
+  int InsertCost = 0;
+  for (unsigned I = 0; I < VectorizableTree.size(); I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      for (Use &U : Inst->operands()) {
+        Value *Op = U.get();
+        if (VecToScalars.count(Op))
+          InsertCost += getGatherCost(Op);
+      }
+    }
+  }
+  return InsertCost;
+}
+
+int BoUpSLP::getSubTreeCost(ArrayRef<unsigned> VecNodes) {
   int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+  int ExtractCost = 0;
+  int SpillCost = 0;
+  SmallPtrSet<Value *, 4> ScalarsToVec;
+  SmallPtrSet<Value *, 4> VecToScalars;
+
+#ifndef NDEBUG
+  for (unsigned N : VecNodes) {
+    if (!N)
+      continue;
+    do {
+      N = Nodes[N]->Prev;
+      assert(is_contained(VecNodes, N) &&
+             "An incorrect gap in indices according to the tree structure");
+    } while (N != 0);
+  }
+#endif
+
+  CostsRecalculations += VectorizableTree.size();
+
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (is_contained(VecNodes, I)) {
+      for (Value *V : Entry->Scalars)
+        ScalarsToVec.insert(V);
+      int GatherCost = 0;
+      auto It = UseTreeIndices.find(I);
+      if (It != UseTreeIndices.end())
+        for (int Gather : It->second)
+          if (VectorizableTree[Gather].NeedToGather)
+            GatherCost += EntriesCosts[Gather];
+      Cost += EntriesCosts[I] + GatherCost;
+    } else {
+      for (Value *V : Entry->Scalars)
+        VecToScalars.insert(V);
+    }
+  }
+
+  int Inserts = getInsertCost(VecToScalars);
+  Cost += Inserts;
+  ExtractCost = getExtractCost(ScalarsToVec);
+  Cost += ExtractCost;
+  SpillCost = getSpillCost(ScalarsToVec);
+  Cost += SpillCost;
+
+  return Cost;
+}
+
+void BoUpSLP::BFS() {
+  std::vector<unsigned> Worklist;
+  Worklist.push_back(0);
+  Nodes[0]->Color = 1;
+  do {
+    unsigned Current = Worklist.back();
+    Worklist.pop_back();
+    bool isLeaf = true;
+    for (unsigned Next : UseTreeIndices[Current]) {
+      if (Next == Current)
+        continue;
+      if (VectorizableTree[Next].NeedToGather)
+        continue;
+      isLeaf = false;
+      // To simplify the algorithm, consider
+      // the top node of a cycle as a leaf.
+      if (Nodes[Next]->Color != 0) {
+        isLeaf = true;
+      } else {
+        Worklist.push_back(Next);
+        Nodes[Next]->Color = 1;
+        Nodes[Next]->Prev = Current;
+      }
+    }
+    if (isLeaf && Current)
+      Nodes[Current]->IsLeaf = true;
+  } while (!Worklist.empty());
+}
+
+unsigned BoUpSLP::cutBranch(unsigned LeafIdx, unsigned L,
+                            ArrayRef<unsigned> VecNodes, int &Cost) {
+  SmallVector<unsigned, 4> Vecs;
+  SmallVector<unsigned, 4> Path;
+
+  for (unsigned V : VecNodes)
+    Vecs.push_back(V);
+  assert(Nodes[LeafIdx]->IsLeaf && "This is supposed to be a leaf node");
+  assert(Nodes[L]->IsLeaf && "This is supposed to be a leaf node");
+
+  unsigned I = L;
+  unsigned RealOpAt = UINT_MAX;
+  do {
+    if (Nodes[I]->Color == LeafIdx)
+      break;
+    Path.push_back(I);
+    I = Nodes[I]->Prev;
+  } while (I != 0);
+  if (I == 0)
+    Path.push_back(0);
+  // Find the first entry with operations that are worth vectorizing.
+  for (I = Path.size(); I--;) {
+    unsigned Idx = Nodes[Path[I]]->Idx;
+    InstructionsState S = getSameOpcode(VectorizableTree[Idx].Scalars);
+    auto *Inst = S.MainOp;
+    if (Inst && (isa(Inst) || isa(Inst) ||
+                 isa(Inst))) {
+      RealOpAt = I;
+      break;
+    }
+  }
+  if (RealOpAt == UINT_MAX)
+    return UINT_MAX;
+  for (I = 0; I < Path.size(); I++) {
+    SmallVector<unsigned, 4> SubTree(Vecs);
+    SubTree.insert(SubTree.end(), Path.begin() + I, Path.end());
+    if (MaxCostsRecalculations < CostsRecalculations || SubTree.size() <= 1 ||
+        I > RealOpAt)
+      break;
+    Cost = getSubTreeCost(SubTree);
+    if (Cost < -SLPCostThreshold)
+      return Path[I];
+  }
+  return UINT_MAX;
+}
+
+int BoUpSLP::extendSubTree(unsigned LeafIdx, ArrayRef<unsigned> VecNodes) {
+  int SubTreeCost = 0;
+  SmallVector<unsigned, 4> Vecs;
+  SmallVector<unsigned, 4> Path;
+  for (unsigned V : VecNodes) {
+    Vecs.push_back(V);
+    Path.push_back(V);
+  }
+  assert(Nodes[LeafIdx]->IsLeaf && "This is supposed to be a leaf node");
+
+  // Add leaves from the graph along the way to the bottom of
+  // the graph if profitable.
+  for (auto &N : Nodes) {
+    unsigned L = N.first;
+    if (Nodes[L]->Color == LeafIdx || !Nodes[L]->IsLeaf)
+      continue;
+    assert(Nodes[L]->IsLeaf && "This is supposed to be a leaf node");
+    int Cost;
+    unsigned Node = cutBranch(LeafIdx, L, Vecs, Cost);
+    if (Node != UINT_MAX) {
+      SmallVector<unsigned, 4> BranchPath;
+      unsigned I = Node;
+      do {
+        if (Nodes[I]->Color == LeafIdx)
+          break;
+        BranchPath.push_back(I);
+        I = Nodes[I]->Prev;
+      } while (I != 0);
+      Vecs.insert(Vecs.end(), BranchPath.begin(), BranchPath.end());
+      for (unsigned V : BranchPath)
+        Nodes[V]->Color = LeafIdx;
+      SubTreeCost = Cost;
+    }
+  }
 
-  unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+  if (SubTreeCost < -SLPCostThreshold) {
+    assert(Vecs.size() < NodesList.size() &&
+           "Incorrect number of nodes to vectorize");
+    LLVM_DEBUG(dbgs() << "SLP: Reduced the number of elements to "
+                      << Vecs.size() << " in the tree with " << SubTreeCost
+                      << " cost to make it partially vectorizable.\n");
+    cutTree(Vecs);
+    return SubTreeCost;
+  }
+  return INT_MAX;
+}
+int BoUpSLP::findSubTree() {
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    if (VectorizableTree[I].NeedToGather)
+      continue;
+    NodesList.push_back(llvm::make_unique<Node>(I));
+    Nodes[I] = NodesList.back().get();
+    assert(I == Nodes[I]->Idx && "Incorrect node index");
+  }
+  BFS();
+
+  // Walk the graph leaves and reduce the branch height
+  // to find any profitable part.
+ for (auto &N : Nodes) { + if (!N.second->IsLeaf) + continue; + unsigned L = N.second->Idx; + SmallVector Path; + int Cost; + unsigned I = cutBranch(L, L, Path, Cost); + if (I != UINT_MAX) { + do { + Path.push_back(I); + Nodes[I]->Color = L; + I = Nodes[I]->Prev; + } while (I != 0); + if (Path.back() != 0) { + Nodes[0]->Color = L; + Path.push_back(0); + } + // Try to extend the already profitable part to maximize + // the number of operations to vectorize. + int CostWithLeafs = extendSubTree(L, Path); + if (CostWithLeafs == INT_MAX) { + LLVM_DEBUG(dbgs() << "SLP: Reduced the number of elements to " + << Path.size() << " in the tree with " << Cost + << " cost to make it partially vectorizable.\n"); + cutTree(Path); + return Cost; + } + return CostWithLeafs; + } + } + return INT_MAX; +} + +int BoUpSLP::getTreeCost(bool cutTree) { + SmallPtrSet ScalarsToVec; + SmallPtrSet VecToScalars; + int CostSum = 0; + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << VectorizableTree.size() << ".\n"); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = VectorizableTree[I]; + EntriesCosts.push_back(0); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -3167,56 +3637,47 @@ })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + for (auto &UseTreeIdx : TE.UserTreeIndices) + UseTreeIndices[UseTreeIdx.Idx].push_back(I); + + if (!TE.NeedToGather) { + for (Value *V : TE.Scalars) + ScalarsToVec.insert(V); + } + + EntriesCosts[I] = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << EntriesCosts[I] << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; + + CostSum += EntriesCosts[I]; } - SmallPtrSet ExtractCostCalculated; int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; - - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; + int SpillCost = 0; + int Cost = CostSum; - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } - } - - int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + ExtractCost = getExtractCost(ScalarsToVec); + Cost += ExtractCost; + SpillCost = getSpillCost(ScalarsToVec); + Cost += SpillCost; std::string Str; - { - raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; + if (SLPThrottling && cutTree && Cost >= -SLPCostThreshold) { + Cost = findSubTree(); + { + raw_string_ostream OS(Str); + OS << "SLP: Partial vectorization with Total Cost = " << Cost << ".\n"; + } + } else { + { + raw_string_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; + } } LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); @@ -3935,7 +4396,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (RemovedOperations.size()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -4064,13 +4530,16 @@ Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG - for (User *U : Scalar->users()) { - LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - - // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); - } + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!RemovedOperations.size()) + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + + // It is legal to replace users in the ignorelist by undef. 
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + "Replacing out-of-tree value with undef"); + } #endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); @@ -4547,6 +5016,32 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (int I : RemovedOperations) { + TreeEntry *Entry = &VectorizableTree[I]; + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5015,6 +5510,12 @@ << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } + + // Partially vectorize trees after all full vectorization is done, + // otherwise, we could prevent more profitable full vectorization with + // smaller vector sizes. + if (SLPThrottling) + Changed |= R.tryPartialVectorization(); } if (Changed) { @@ -5090,6 +5591,8 @@ // Move to the next bundle. i += VF - 1; Changed = true; + } else { + R.recordSeeds(Operands); } } @@ -5319,6 +5822,8 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else { + R.recordSeeds(Ops); } } } Index: test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -12,21 +12,18 @@ ; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> ; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32> ; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]] -; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0 -; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64 -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]] +; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]] -; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP2]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]] -; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] -; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 -; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* 
[[P]], i64 [[S3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP4]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]] ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/AArch64/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -28,39 +28,42 @@ ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_LR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_026]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[TMP9]] ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD]], undef ; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[ADD11]], undef -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: 
[[ADD27:%.*]] = add nsw i32 [[ADD19]], undef ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_025]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> undef, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 1, i32 1 +; CHECK-NEXT: [[TMP13]] = add nsw <2 x i32> [[TMP12]], [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP14]], [[H]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP15]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[S_0_LCSSA]] ; entry: Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -39,7 +39,6 @@ ; CHECK-LABEL: @store_chain_v2i64( ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 ; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 ; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 ; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 @@ -48,10 +47,13 @@ ; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP1_0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP1_1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -12,38 +12,37 @@ ; SSE: for.body: ; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S1_055:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I40:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S0_054:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], 
[ [[COND_I44:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP21:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; SSE-NEXT: [[ADD:%.*]] = fadd float [[S0_054]], [[TMP0]] -; SSE-NEXT: [[ADD3:%.*]] = fadd float [[S1_055]], [[TMP0]] -; SSE-NEXT: [[MUL:%.*]] = fmul float [[S0_054]], 0.000000e+00 -; SSE-NEXT: [[ADD4:%.*]] = fadd float [[MUL]], [[ADD3]] -; SSE-NEXT: [[MUL5:%.*]] = fmul float [[S1_055]], 0.000000e+00 -; SSE-NEXT: [[ADD6:%.*]] = fadd float [[MUL5]], [[ADD]] -; SSE-NEXT: [[CMP_I:%.*]] = fcmp olt float [[ADD6]], 1.000000e+00 -; SSE-NEXT: [[COND_I:%.*]] = select i1 [[CMP_I]], float [[ADD6]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I51:%.*]] = fcmp olt float [[COND_I]], -1.000000e+00 -; SSE-NEXT: [[CMP_I49:%.*]] = fcmp olt float [[ADD4]], 1.000000e+00 -; SSE-NEXT: [[COND_I50:%.*]] = select i1 [[CMP_I49]], float [[ADD4]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I47:%.*]] = fcmp olt float [[COND_I50]], -1.000000e+00 -; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[COND_I]], 0.000000e+00 -; SSE-NEXT: [[MUL10:%.*]] = select i1 [[CMP_I51]], float -0.000000e+00, float [[COND_I_OP]] -; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[COND_I50]], 0.000000e+00 -; SSE-NEXT: [[MUL11:%.*]] = select i1 [[CMP_I47]], float -0.000000e+00, float [[COND_I50_OP]] -; SSE-NEXT: [[ADD13]] = fadd float [[MUL10]], [[MUL11]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] +; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], +; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> +; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer +; SSE-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] +; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; SSE-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] ; SSE-NEXT: [[CMP_I45:%.*]] = fcmp olt float [[ADD13]], 1.000000e+00 ; SSE-NEXT: [[COND_I46:%.*]] = select i1 [[CMP_I45]], float [[ADD13]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I43:%.*]] = fcmp olt float [[COND_I46]], -1.000000e+00 -; SSE-NEXT: [[COND_I44]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] -; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[MUL11]], 1.000000e+00 -; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[MUL11]], 
float 1.000000e+00 -; SSE-NEXT: [[CMP_I39:%.*]] = fcmp olt float [[COND_I42]], -1.000000e+00 -; SSE-NEXT: [[COND_I40]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] +; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[TMP17]], 1.000000e+00 +; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[TMP17]], float 1.000000e+00 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[COND_I42]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[COND_I46]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], +; SSE-NEXT: [[TMP21]] = select <2 x i1> [[TMP20]], <2 x float> , <2 x float> [[TMP19]] ; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; SSE: for.end: Index: test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -15,18 +15,20 @@ ; CHECK: for.body6: ; CHECK-NEXT: br label [[FOR_BODY12:%.*]] ; CHECK: for.body12: -; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ] -; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]] -; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]] -; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[FOR_BODY6]] ], [ [[TMP7:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double [[ADD15]], 4.000000e+00 ; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]] -; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] -; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[TMP4]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL18]], i32 1 +; CHECK-NEXT: [[TMP7]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] ; CHECK: for.inc21: ; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] Index: test/Transforms/SLPVectorizer/X86/lookahead.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/lookahead.ll +++ test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -30,14 +30,18 @@ ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 -; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]] ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] ; 
CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] -; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]] -; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]] -; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]] -; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 -; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[A_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C_1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[D_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUBCD_0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[SUBAB_1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry:
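For reference, a minimal sketch of how the new knobs could be exercised with opt on a small input. The flag names come from the cl::opt definitions added above, but the exact RUN line of the new test is not shown in this diff, so the invocation below is an assumption; whether such a tree ends up fully or only partially vectorized depends on the target cost model.

; RUN: opt < %s -slp-vectorizer -slp-throttling=true -slp-throttling-budget=500 -S -mtriple=x86_64-unknown-linux-gnu
define void @seeds(double* %a, double %x, double %y) {
entry:
  %p1 = getelementptr inbounds double, double* %a, i64 1
  %v0 = load double, double* %a, align 8
  %v1 = load double, double* %p1, align 8
  ; Only part of this chain may be profitable to vectorize; with throttling
  ; enabled the SLP vectorizer may cut the tree instead of giving up entirely.
  %r0 = fsub double %v0, %x
  %r1 = fsub double %v1, %y
  store double %r0, double* %a, align 8
  store double %r1, double* %p1, align 8
  ret void
}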