Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -516,11 +516,14 @@ /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost(); + int getSpillCost(const SmallPtrSetImpl &ScalarsToVec); + + /// \returns the cost of extracting vectorized elements. + int getExtractCost(const SmallPtrSetImpl &ScalarsToVec); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeCost(); + int getTreeCost(bool ReduceTree = false); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -541,6 +544,7 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + RemovedOprations.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { @@ -600,6 +604,9 @@ /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + /// Reduce the tree cost to make it partially vectorizable, if possible. + void reduceTree(); + OptimizationRemarkEmitter *getORE() { return ORE; } private: @@ -612,7 +619,7 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -626,7 +633,7 @@ Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(Value *Parent, ArrayRef VL); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -700,15 +707,26 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Cost of the tree entry. + int Cost = 0; + + /// Full cost of expanding the tree at this height. + int ExpandAtCost = 0; + + /// Parent operation of this operation. + Value *Parent = nullptr; }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(Value *Parent, ArrayRef VL, bool Vectorized, + int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; + Last->Parent = Parent; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -742,6 +760,9 @@ /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; + /// Tree entries that should not be vectorized due to throttling. + SmallVector RemovedOprations; + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1170,6 +1191,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove operations from the list of operations proposed to be scheduled.
+ void removeFromScheduling(BlockScheduling *BS, ArrayRef Oprations); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); @@ -1330,7 +1354,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -1391,35 +1415,35 @@ } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } @@ -1431,7 +1455,7 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } } @@ -1441,7 +1465,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -1460,7 +1484,7 @@ if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } } @@ -1471,7 +1495,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } } @@ -1485,7 +1509,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } @@ -1505,7 +1529,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } VL = UniqueValues; @@ -1522,7 +1546,7 @@ assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1543,12 +1567,12 @@ LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +1582,7 @@ Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1569,7 +1593,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -1586,12 +1610,12 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -1607,7 +1631,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1620,7 +1644,7 @@ auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1674,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1667,7 +1691,7 @@ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1687,13 +1711,13 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +1726,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1716,14 +1740,14 @@ if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1732,7 +1756,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1755,7 +1779,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1763,8 +1787,8 @@ if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Left, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Right, Depth + 1, UserTreeIdx); return; } @@ -1774,7 +1798,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; @@ -1784,7 +1808,7 @@ if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1798,7 +1822,7 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different 
types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1810,12 +1834,12 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +1847,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1832,19 +1856,19 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1855,7 +1879,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1869,7 +1893,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1880,7 +1904,7 @@ Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1892,14 +1916,14 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
@@ -1907,7 +1931,7 @@ CallInst *CI2 = dyn_cast(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1916,19 +1940,19 @@ // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Left, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Right, Depth + 1, UserTreeIdx); return; } @@ -1938,13 +1962,13 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2413,6 +2437,70 @@ } } +void BoUpSLP::reduceTree() { + // Estimating where to stop vectorization. + unsigned StopAt = VectorizableTree.size() - 1; + int MinCost = VectorizableTree.back().ExpandAtCost; + for (unsigned I = VectorizableTree.size(); I--;) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + if (VectorizableTree[I].ExpandAtCost < MinCost) { + MinCost = VectorizableTree[I].ExpandAtCost; + StopAt = I; + } + } + + if (StopAt == (VectorizableTree.size() - 1)) + return; + + // Canceling unprofitable elements. + int ReducedBy = MinCost - VectorizableTree.back().ExpandAtCost; + LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << ReducedBy + << " to make it partially vectorizable.\n"); + for (unsigned I = StopAt + 1, E = VectorizableTree.size(); I < E; I++) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + Entry->NeedToGather = true; + for (Value *V : Entry->Scalars) { + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); + RemovedOprations.push_back(I); + ScalarToTreeEntry.erase(V); + MustGather.insert(V); + ExternalUses.erase( + std::remove_if(ExternalUses.begin(), ExternalUses.end(), + [&](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + } + + // Add external users for canceled elements. + for (unsigned I = 0, E = StopAt; I <= E; I++) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + Instruction *UserInst = dyn_cast(U); + if (!UserInst) + continue; + if (getTreeEntry(U)) + continue; + // Ignore users in the user ignore list. 
+ if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " + << Lane << " from " << *Scalar << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); + } + } + } +} + bool BoUpSLP::isFullyVectorizableTinyTree() { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); @@ -2457,7 +2545,7 @@ return true; } -int BoUpSLP::getSpillCost() { +int BoUpSLP::getSpillCost(const SmallPtrSetImpl &ScalarsToVec) { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it @@ -2481,7 +2569,7 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && ScalarsToVec.count(&*J) > 0) LiveValues.insert(cast(&*J)); } @@ -2512,23 +2600,53 @@ V.push_back(VectorType::get(II->getType(), BundleWidth)); Cost += TTI->getCostOfKeepingLiveOverCall(V); } - ++PrevInstIt; } - PrevInst = Inst; } return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; - LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " - << VectorizableTree.size() << ".\n"); +int BoUpSLP::getExtractCost(const SmallPtrSetImpl &ScalarsToVec) { + int ExtractCost = 0; + unsigned BundleWidth = VectorizableTree.front().Scalars.size(); + SmallPtrSet ExtractCostCalculated; + for (ExternalUser &EU : ExternalUses) { - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + // Avoid non-vectorized scalars for this tree hight. + if (ScalarsToVec.count(EU.Scalar) == 0) + continue; + + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + if (MinBWs.count(ScalarRoot)) { + auto *MinTy = IntegerType::get(F->getContext(), + MinBWs[ScalarRoot].first); + auto Extend = + MinBWs[ScalarRoot].second ? + Instruction::SExt : Instruction::ZExt; + VecTy = VectorType::get(MinTy, BundleWidth); + ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), + VecTy, EU.Lane); + } else { + ExtractCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + } + } + return ExtractCost; +} +int BoUpSLP::getTreeCost(bool ReduceTree) { + SmallDenseMap> GatherMap; + unsigned NonGatherNum = 0; for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = VectorizableTree[I]; @@ -2551,60 +2669,77 @@ })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + if (TE.NeedToGather) { + GatherMap[TE.Parent].push_back(I); + } else { + NonGatherNum++; + } + + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; } - SmallPtrSet ExtractCostCalculated; + SmallPtrSet ScalarsToVec; + int CostSum = 0; + int MinCost = SLPCostThreshold; + int MinExtractCost = 0; + int MinSpillCost = 0; int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. 
- if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; + int SpillCost = 0; + unsigned NonGatherIdx = 0; - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; + for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) { + TreeEntry *Entry = &VectorizableTree[I]; - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + int GatherCost = 0; + Entry->ExpandAtCost = 0; + if (!Entry->NeedToGather) { + NonGatherIdx++; + for (Value *V : Entry->Scalars) { + ScalarsToVec.insert(V); + if (GatherMap.find(V) != GatherMap.end()) + for (int Gather : GatherMap[V]) + GatherCost += VectorizableTree[Gather].Cost; + } + if (NonGatherIdx != NonGatherNum) + Entry->ExpandAtCost += getGatherCost(Entry->Scalars); + CostSum += Entry->Cost + GatherCost; + } + Entry->ExpandAtCost += CostSum; + + int ExtractCost = getExtractCost(ScalarsToVec); + Entry->ExpandAtCost += ExtractCost; + + int SpillCost = getSpillCost(ScalarsToVec); + Entry->ExpandAtCost += SpillCost; + + if (ReduceTree && MinCost > Entry->ExpandAtCost && !Entry->NeedToGather) { + MinCost = Entry->ExpandAtCost; + MinExtractCost = ExtractCost; + MinSpillCost = SpillCost; } } - int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + if (!ReduceTree) { + MinCost = VectorizableTree.back().ExpandAtCost; + MinExtractCost = ExtractCost; + MinSpillCost = SpillCost; + } std::string Str; { raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; + OS << "SLP: Spill Cost = " << MinSpillCost << ".\n" + << "SLP: Extract Cost = " << MinExtractCost << ".\n" + << "SLP: Total Cost = " << MinCost << ".\n"; } LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); - return Cost; + return MinCost; } int BoUpSLP::getGatherCost(Type *Ty, @@ -2950,7 +3085,7 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { +Value *BoUpSLP::vectorizeTree(Value *Parent, ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) { @@ -3088,7 +3223,7 @@ Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(S.OpValue, Operands); NewPhi->addIncoming(Vec, IBB); } @@ -3183,7 +3318,7 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(S.OpValue, INVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3210,8 +3345,8 @@ setInsertPointAfterBundle(E->Scalars, S); 
- Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(S.OpValue, LHSV); + Value *R = vectorizeTree(S.OpValue, RHSV); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3244,9 +3379,9 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(S.OpValue, CondVec); + Value *True = vectorizeTree(S.OpValue, TrueVec); + Value *False = vectorizeTree(S.OpValue, FalseVec); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3293,8 +3428,8 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(S.OpValue, LHSVL); + Value *RHS = vectorizeTree(S.OpValue, RHSVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3373,7 +3508,7 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(S.OpValue, ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3404,7 +3539,7 @@ for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(S.OpValue, Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; @@ -3413,7 +3548,7 @@ for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(S.OpValue, OpVL); OpVecs.push_back(OpVec); } @@ -3456,7 +3591,7 @@ OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(S.OpValue, OpVL); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3497,14 +3632,14 @@ if (Instruction::isBinaryOp(S.getOpcode())) { reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + LHS = vectorizeTree(S.OpValue, LHSVL); + RHS = vectorizeTree(S.OpValue, RHSVL); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + LHS = vectorizeTree(S.OpValue, INVL); } if (E->VectorizedValue) { @@ -3574,7 +3709,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (!RemovedOprations.empty()) + removeFromScheduling(BS, RemovedOprations); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -3702,15 +3842,8 @@ Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG - for (User *U : Scalar->users()) { - LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - - // It is legal to replace users in the ignorelist by undef. 
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); - } -#endif + // The tree might not be fully vectorized, so we don't have to + // check every user. Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } @@ -4186,6 +4319,33 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS, + ArrayRef Oprations) { + bool Removed = false; + for (int I : Oprations) { + TreeEntry *Entry = &VectorizableTree[I]; + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (Removed) { + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4682,7 +4842,16 @@ const unsigned ChainLen = Chain.size(); LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); - const unsigned Sz = R.getVectorElementSize(Chain[0]); + Value *FirstStore = nullptr; + for (Value *V : Chain) { + assert(isa(V) && "Expected only StoreInst here!"); + if (StoreInst *SI = cast(V)) + if (SI->getValueOperand()) + FirstStore = V; + } + if (!FirstStore) + return false; + const unsigned Sz = R.getVectorElementSize(FirstStore); const unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) @@ -4703,13 +4872,20 @@ << "\n"); ArrayRef Operands = Chain.slice(i, VF); + // Skip if any store instruction vectorized. + if (std::any_of(Operands.begin(), Operands.end(), + [](Value *V) { + return (!(cast(V))->getValueOperand()); + })) + continue; + R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + int Cost = R.getTreeCost(true); LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); @@ -4724,6 +4900,7 @@ << " and with tree size " << NV("TreeSize", R.getTreeSize())); + R.reduceTree(); R.vectorizeTree(); // Move to the next bundle. 
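Note on the new cost model (editor's sketch, not part of the diff): getTreeCost(/*ReduceTree=*/true) walks the vectorizable entries in order and records, for every non-gather entry, ExpandAtCost - the sum of the entry costs seen so far, plus the cost of gathering that entry's results for users deeper in the tree, plus the extract and spill costs computed for only the scalars vectorized up to that point. reduceTree() then treats the entry with the smallest ExpandAtCost as the stop point and demotes every later entry back to a gather, recording its index in RemovedOprations so removeFromScheduling() can cancel the corresponding bundles before scheduleBlock() runs. A minimal self-contained sketch of the stop-point selection follows; the names NodeCost and pickStopPoint are hypothetical and do not appear in the patch.

// Illustrative sketch only, simplified from getTreeCost()/reduceTree() above;
// it ignores how ExpandAtCost is accumulated and assumes a non-empty tree.
#include <cstddef>
#include <vector>

struct NodeCost {
  bool NeedToGather;  // gathered entries cannot serve as a stop point
  int ExpandAtCost;   // total tree cost if vectorization stops at this entry
};

// Return the index of the entry at which stopping vectorization is cheapest
// (ties keep the later entry); everything past it gets throttled to gathers.
static std::size_t pickStopPoint(const std::vector<NodeCost> &Tree) {
  std::size_t StopAt = Tree.size() - 1;
  int MinCost = Tree.back().ExpandAtCost;
  for (std::size_t I = Tree.size(); I-- > 0;) {
    if (Tree[I].NeedToGather)
      continue;
    if (Tree[I].ExpandAtCost < MinCost) {
      MinCost = Tree[I].ExpandAtCost;
      StopAt = I;
    }
  }
  return StopAt;
}

If the chosen stop point is already the last entry, the tree is left untouched, which matches the early return in reduceTree().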
Index: test/Transforms/SLPVectorizer/X86/intrinsic.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/intrinsic.ll +++ test/Transforms/SLPVectorizer/X86/intrinsic.ll @@ -147,8 +147,16 @@ ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP5]], i1 true) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[C:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[C:%.*]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/long_chains.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/long_chains.ll +++ test/Transforms/SLPVectorizer/X86/long_chains.ll @@ -9,26 +9,27 @@ define i32 @test(double* nocapture %A, i8* nocapture %B) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> undef, i8 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> [[TMP4]], i8 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double> +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[B:%.*]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3 +; CHECK-NEXT: [[ADD4:%.*]] = add i8 [[TMP1]], 3 +; CHECK-NEXT: [[CONV6:%.*]] = sitofp i8 [[ADD]] to double +; CHECK-NEXT: [[CONV7:%.*]] = sitofp i8 [[ADD4]] to double +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[CONV6]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV7]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> , [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> , [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> , [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> , [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> , [[TMP12]] 
-; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> , [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> , [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/resched.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/resched.ll +++ test/Transforms/SLPVectorizer/X86/resched.ll @@ -12,70 +12,71 @@ ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 +; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4 +; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 +; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 +; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 -; 
CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], +; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 +; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 +; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 +; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], +; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 ; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> 
[[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> , [[TMP43]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP5]], i32 4 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP6]], i32 5 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP7]], i32 6 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP8]], i32 7 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP9]], i32 8 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[TMP10]], i32 9 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP11]], i32 10 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP12]], i32 11 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP13]], i32 12 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: [[TMP33:%.*]] = and <16 x i8> , [[TMP32]] ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP33]], <16 x i8>* [[TMP34]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s + +@e = common dso_local local_unnamed_addr global double 0.000000e+00, align 8 +@f = common dso_local local_unnamed_addr global i32 0, align 4 +@m = common dso_local local_unnamed_addr global double 0.000000e+00, align 8 +@g = common dso_local local_unnamed_addr global double 0.000000e+00, align 8 +@i = common dso_local local_unnamed_addr global i32 0, align 4 +@n = common dso_local local_unnamed_addr global double 0.000000e+00, align 8 +@h = common dso_local local_unnamed_addr global i32 0, align 4 +@j = common dso_local local_unnamed_addr global i32 0, align 4 +@k = common dso_local local_unnamed_addr global i32 0, align 4 +@c = common dso_local local_unnamed_addr global i32 0, align 4 +@l = common dso_local local_unnamed_addr global i32 0, align 4 +@b = common dso_local local_unnamed_addr global double 0.000000e+00, align 8 +@d = common dso_local local_unnamed_addr global i32 0, align 4 +@a = common dso_local local_unnamed_addr global double* null, align 8 + +; Function Attrs: norecurse nounwind uwtable +define dso_local i32 @foo() local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* @e, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @f, align 4 +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP0]], [[CONV]] +; CHECK-NEXT: store double [[ADD]], double* @m, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* @g, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* @i, align 4 +; CHECK-NEXT: [[CONV1:%.*]] = sitofp i32 [[TMP3]] to double +; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[TMP2]], [[CONV1]] +; CHECK-NEXT: store double [[ADD2]], double* @n, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* @h, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* @j, align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: store i32 [[ADD3]], i32* @k, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* @c, align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[TMP6]] +; CHECK-NEXT: store i32 [[SUB]], i32* @l, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load double, double* @b, align 8 +; CHECK-NEXT: [[CONV4:%.*]] = sitofp i32 [[SUB]] to double +; CHECK-NEXT: [[SUB5:%.*]] = fsub double [[TMP7]], [[CONV4]] +; CHECK-NEXT: [[CONV6:%.*]] = fptosi double [[SUB5]] to i32 +; CHECK-NEXT: store i32 [[CONV6]], i32* @c, align 4 +; CHECK-NEXT: [[CONV7:%.*]] = sitofp i32 [[ADD3]] to double +; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP7]], [[CONV7]] +; CHECK-NEXT: [[CONV9:%.*]] = fptosi double [[ADD8]] to i32 +; CHECK-NEXT: store i32 [[CONV9]], i32* @d, align 4 +; CHECK-NEXT: [[CONV10:%.*]] = sitofp i32 [[CONV6]] to double +; CHECK-NEXT: [[TMP8:%.*]] = load double*, double** @a, align 8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[TMP8]], i64 24 +; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[CONV9]] to double +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[ADD]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[ADD2]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> undef, double [[CONV10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[CONV12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds double, double* 
[[TMP8]], i64 25 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8 +; CHECK-NEXT: ret i32 undef +; +entry: + %0 = load double, double* @e, align 8 + %1 = load i32, i32* @f, align 4 + %conv = sitofp i32 %1 to double + %add = fadd double %0, %conv + store double %add, double* @m, align 8 + %2 = load double, double* @g, align 8 + %3 = load i32, i32* @i, align 4 + %conv1 = sitofp i32 %3 to double + %add2 = fadd double %2, %conv1 + store double %add2, double* @n, align 8 + %4 = load i32, i32* @h, align 4 + %5 = load i32, i32* @j, align 4 + %add3 = add nsw i32 %5, %4 + store i32 %add3, i32* @k, align 4 + %6 = load i32, i32* @c, align 4 + %sub = sub nsw i32 0, %6 + store i32 %sub, i32* @l, align 4 + %7 = load double, double* @b, align 8 + %conv4 = sitofp i32 %sub to double + %sub5 = fsub double %7, %conv4 + %conv6 = fptosi double %sub5 to i32 + store i32 %conv6, i32* @c, align 4 + %conv7 = sitofp i32 %add3 to double + %add8 = fadd double %7, %conv7 + %conv9 = fptosi double %add8 to i32 + store i32 %conv9, i32* @d, align 4 + %conv10 = sitofp i32 %conv6 to double + %add11 = fadd double %add, %conv10 + %8 = load double*, double** @a, align 8 + %arrayidx = getelementptr inbounds double, double* %8, i64 24 + store double %add11, double* %arrayidx, align 8 + %conv12 = sitofp i32 %conv9 to double + %add13 = fadd double %add2, %conv12 + %arrayidx14 = getelementptr inbounds double, double* %8, i64 25 + store double %add13, double* %arrayidx14, align 8 + ret i32 undef +} + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @bar(i32 %n, double* nocapture %a, i32 %nc, double* nocapture readonly %c) local_unnamed_addr #0 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[N:%.*]], 1 +; CHECK-NEXT: [[CMP77:%.*]] = icmp sgt i32 [[N]], 5 +; CHECK-NEXT: br i1 [[CMP77]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[NC:%.*]], 1 +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[MUL]], [[SHR]] +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SHR]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[DIV]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[NC]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV81:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT82:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 2, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N]], [[TMP3]] +; CHECK-NEXT: [[INDVARS_IV_NEXT82]] = add nsw i64 [[INDVARS_IV81]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i64 [[TMP2]], [[INDVARS_IV_NEXT82]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[SUB2:%.*]] = fsub double 5.000000e-01, [[TMP5]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[INDVARS_IV_NEXT82]] +; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM7]] +; CHECK-NEXT: 
[[TMP7:%.*]] = load double, double* [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 +; CHECK-NEXT: [[SUB9:%.*]] = fsub double [[TMP11]], [[TMP7]] +; CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[SUB]], 1 +; CHECK-NEXT: [[IDXPROM14:%.*]] = sext i32 [[ADD13]] to i64 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM14]] +; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[ARRAYIDX15]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[MUL17:%.*]] = fmul double [[SUB2]], [[SUB9]] +; CHECK-NEXT: [[MUL18:%.*]] = fmul double [[TMP6]], [[ADD16]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd double [[MUL17]], [[MUL18]] +; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[SUB2]], [[ADD16]] +; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[TMP6]], [[SUB9]] +; CHECK-NEXT: [[SUB22:%.*]] = fsub double [[MUL20]], [[MUL21]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> [[TMP14]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = fsub <2 x double> [[TMP10]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP16]], <2 x double>* [[TMP17]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[ADD32:%.*]] = fadd double [[TMP18]], [[ADD19]] +; CHECK-NEXT: store double [[ADD32]], double* [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load double, double* [[ARRAYIDX15]], align 8 +; CHECK-NEXT: [[SUB36:%.*]] = fsub double [[TMP19]], [[SUB22]] +; CHECK-NEXT: store double [[SUB36]], double* [[ARRAYIDX15]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %shr = ashr i32 %n, 1 + %cmp77 = icmp sgt i32 %n, 5 + br i1 %cmp77, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %mul = shl nsw i32 %nc, 1 + %div = sdiv i32 %mul, %shr + %0 = sext i32 %shr to i64 + %1 = sext i32 %div to i64 + %2 = sext i32 %nc to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv81 = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next82, %for.body ] + %indvars.iv = phi i64 [ 2, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %3 = trunc i64 %indvars.iv to i32 + %sub = sub nsw i32 %n, %3 + %indvars.iv.next82 = add nsw i64 %indvars.iv81, %1 + %4 = sub nsw i64 %2, %indvars.iv.next82 + %arrayidx = getelementptr inbounds double, double* %c, i64 %4 + %5 = load double, double* %arrayidx, align 8 + %sub2 = fsub double 5.000000e-01, %5 + %arrayidx4 = getelementptr inbounds double, double* %c, i64 %indvars.iv.next82 + %6 = load double, double* %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds double, double* %a, i64 %indvars.iv + %7 = load double, double* %arrayidx6, align 8 + %idxprom7 = sext i32 %sub to i64 + %arrayidx8 
= getelementptr inbounds double, double* %a, i64 %idxprom7 + %8 = load double, double* %arrayidx8, align 8 + %sub9 = fsub double %7, %8 + %9 = or i64 %indvars.iv, 1 + %arrayidx12 = getelementptr inbounds double, double* %a, i64 %9 + %10 = load double, double* %arrayidx12, align 8 + %add13 = add nsw i32 %sub, 1 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds double, double* %a, i64 %idxprom14 + %11 = load double, double* %arrayidx15, align 8 + %add16 = fadd double %10, %11 + %mul17 = fmul double %sub2, %sub9 + %mul18 = fmul double %6, %add16 + %add19 = fadd double %mul17, %mul18 + %mul20 = fmul double %sub2, %add16 + %mul21 = fmul double %6, %sub9 + %sub22 = fsub double %mul20, %mul21 + %sub25 = fsub double %7, %add19 + store double %sub25, double* %arrayidx6, align 8 + %sub29 = fsub double %10, %sub22 + store double %sub29, double* %arrayidx12, align 8 + %12 = load double, double* %arrayidx8, align 8 + %add32 = fadd double %12, %add19 + store double %add32, double* %arrayidx8, align 8 + %13 = load double, double* %arrayidx15, align 8 + %sub36 = fsub double %13, %sub22 + store double %sub36, double* %arrayidx15, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +}
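For reference, the call sequence the patch establishes in vectorizeStoreChain() (last SLPVectorizer.cpp hunk above) is roughly the following. This is an editor's paraphrase, not an excerpt: the Cost < -SLPCostThreshold check and the Changed flag come from the pre-existing code around the hunk and are not shown in the diff.

  R.buildTree(Operands);
  if (R.isTreeTinyAndNotFullyVectorizable())
    continue;                      // try the next slice of the store chain
  R.computeMinimumValueSizes();
  // With ReduceTree == true the returned cost is that of the best partial tree.
  int Cost = R.getTreeCost(true);
  if (Cost < -SLPCostThreshold) {
    R.reduceTree();                // demote unprofitable entries to gathers
    R.vectorizeTree();             // schedule and emit the throttled tree
    Changed = true;
  }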