Index: lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -359,6 +359,9 @@
   /// exclusive, possibly decreasing \p Range.End.
   VPlanPtr buildVPlan(VFRange &Range,
                                     const SmallPtrSetImpl<Value *> &NeedDef);
+
+  VPlanPtr transformVPInstructionsToVPRecipies(VPlanPtr &OriginalPlan,
+                                               VFRange &Range);
 };
 
 } // namespace llvm
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6358,7 +6358,11 @@
 
   // 2. Copy and widen instructions from the old loop into the new loop.
   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-  VPlans.front()->execute(&State);
+
+  VFRange Range = {BestVF, BestVF + 1};
+  VPlanPtr Widened = transformVPInstructionsToVPRecipies(VPlans.front(), Range);
+
+  Widened->execute(&State);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
   //    predication, updating analyses.
@@ -6850,6 +6854,15 @@
 LoopVectorizationPlanner::VPlanPtr
 LoopVectorizationPlanner::buildVPlan(VFRange &Range,
                                      const SmallPtrSetImpl<Value *> &NeedDef) {
+  // Create new empty VPlan
+  auto Plan = llvm::make_unique<VPlan>();
+
+  // Build hierarchical CFG
+  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
+  HCFGBuilder.buildHierarchicalCFG(*Plan.get());
+
+  sinkInstructions(Plan, Legal->getSinkAfter());
+
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
@@ -6857,22 +6870,32 @@
   if (!OrigLoop->empty()) {
     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
 
-    // Create new empty VPlan
-    auto Plan = llvm::make_unique<VPlan>();
-
-    // Build hierarchical CFG
-    VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
-    HCFGBuilder.buildHierarchicalCFG(*Plan.get());
 
     return Plan;
   }
-
   assert(OrigLoop->empty() && "Inner loop expected.");
-  EdgeMaskCache.clear();
-  BlockMaskCache.clear();
-  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
-  DenseMap<Instruction *, Instruction *> SinkAfterInverse;
 
+  std::string PlanName;
+  raw_string_ostream RSO(PlanName);
+  unsigned VF = Range.Start;
+  Plan->addVF(VF);
+  RSO << "Initial VPlan for VF={" << VF;
+  for (VF *= 2; VF < Range.End; VF *= 2) {
+    Plan->addVF(VF);
+    RSO << "," << VF;
+  }
+  RSO << "},UF>=1";
+  RSO.flush();
+  Plan->setName(PlanName);
+
+  return Plan;
+}
+
+// FIXME: move to LoopVectorizationPlanner.cpp, once LoopVectorizationCostModel
+// is moved to a header file.
+LoopVectorizationPlanner::VPlanPtr
+LoopVectorizationPlanner::transformVPInstructionsToVPRecipies(
+    VPlanPtr &OriginalPlan, VFRange &Range) {
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
   // example, original induction update instructions can become dead because we
@@ -6891,72 +6914,57 @@
   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
   auto Plan = llvm::make_unique<VPlan>(VPBB);
 
-  // Represent values that will have defs inside VPlan.
-  for (Value *V : NeedDef)
-    Plan->addVPValue(V);
+  // Conditions may be shared by several branches; add each one only once.
+  SmallPtrSet<Value *, 4> SeenConditions;
+  auto *Latch = OrigLoop->getLoopLatch();
+  for (BasicBlock *BB : OrigLoop->blocks()) {
+    auto *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+    if (BB != Latch && Branch && Branch->isConditional() &&
+        SeenConditions.insert(Branch->getCondition()).second)
+      Plan->addVPValue(Branch->getCondition());
+  }
-  // Scan the body of the loop in a topological order to visit each basic block
-  // after having visited its predecessor basic blocks.
-  LoopBlocksDFS DFS(OrigLoop);
-  DFS.perform(LI);
+  auto *TopRegion = cast<VPRegionBlock>(OriginalPlan->getEntry()); // Non-null.
+  ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+  for (VPBlockBase *Base : RPOT) {
+    VPBasicBlock *OriginalVPBB = Base->getEntryBasicBlock();
+    // Skip entry and exit nodes for now. Currently the recipes will take
+    // care of creating instructions in entry and exit blocks.
+    if (TopRegion && (OriginalVPBB == TopRegion->getEntry() ||
+                      OriginalVPBB == TopRegion->getExit()))
+      continue;
 
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    // Relevant instructions from basic block BB will be grouped into VPRecipe
-    // ingredients and fill a new VPBasicBlock.
-    unsigned VPBBsForBB = 0;
-    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+    auto *FirstVPBBForBB = new VPBasicBlock(OriginalVPBB->getName());
     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
     VPBB = FirstVPBBForBB;
     Builder.setInsertPoint(VPBB);
+    unsigned VPBBsForBB = 0;
 
-    std::vector<Instruction *> Ingredients;
+    std::vector<VPRecipeBase *> Ingredients;
 
-    // Organize the ingredients to vectorize from current basic block in the
-    // right order.
-    for (Instruction &I : BB->instructionsWithoutDebug()) {
-      Instruction *Instr = &I;
+    // Introduce each ingredient into VPlan.
+    for (VPRecipeBase &Ingredient : *OriginalVPBB) {
+      // Only VPInstructions can become recipes here. Never append the null
+      // result of a failed cast; such a recipe is left out of the new plan.
+      VPInstruction *VPInst = dyn_cast<VPInstruction>(&Ingredient);
+      if (!VPInst)
+        continue;
 
-      // First filter out irrelevant instructions, to ensure no recipes are
-      // built for them.
-      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
+      // Each VPInstruction here is expected to wrap an IR Instruction.
+      Instruction *Instr = cast<Instruction>(VPInst->getUnderlyingValue());
+      if (DeadInstructions.count(Instr) || isa<DbgInfoIntrinsic>(Instr))
         continue;
 
-      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
+      VPRecipeBase *Recipe = nullptr;
+      // If Instr belongs to an InterleaveGroup for Range.Start as an adjunct
       // member of the IG, do not construct any Recipe for it.
       const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
       if (IG && Instr != IG->getInsertPos() &&
           Range.Start >= 2 && // Query is illegal for VF == 1
           CM.getWideningDecision(Instr, Range.Start) ==
               LoopVectorizationCostModel::CM_Interleave) {
-        if (SinkAfterInverse.count(Instr))
-          Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
-        continue;
-      }
-
-      // Move instructions to handle first-order recurrences, step 1: avoid
-      // handling this instruction until after we've handled the instruction it
-      // should follow.
-      auto SAIt = SinkAfter.find(Instr);
-      if (SAIt != SinkAfter.end()) {
-        DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
-                     << " to vectorize a 1st order recurrence.\n");
-        SinkAfterInverse[SAIt->second] = Instr;
         continue;
       }
 
-      Ingredients.push_back(Instr);
-
-      // Move instructions to handle first-order recurrences, step 2: push the
-      // instruction to be sunk at its insertion point.
-      auto SAInvIt = SinkAfterInverse.find(Instr);
-      if (SAInvIt != SinkAfterInverse.end())
-        Ingredients.push_back(SAInvIt->second);
-    }
-
-    // Introduce each ingredient into VPlan.
-    for (Instruction *Instr : Ingredients) {
-      VPRecipeBase *Recipe = nullptr;
-
       // Check if Instr should belong to an interleave memory recipe, or already
       // does. In the latter case Instr is irrelevant.
       if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
@@ -6996,8 +7004,7 @@
         handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
       if (NextVPBB != VPBB) {
         VPBB = NextVPBB;
-        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
-                                    : "");
+        VPBB->setName(OriginalVPBB->getName() + "." + Twine(VPBBsForBB++));
       }
     }
   }
Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -18,8 +18,8 @@
 ; Cost of udiv:
 ;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
 ;
-; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
 entry:
@@ -59,8 +59,8 @@
 ; Cost of store:
 ;   (store(4) + extractelement(3)) / 2 = 3
 ;
-; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 ;
 define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -98,10 +98,10 @@
 ; Cost of udiv:
 ;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
 ;
-; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
-; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -143,10 +143,10 @@
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
-; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
-; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 ;
 define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -192,16 +192,16 @@
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
-; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
-; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
-; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
-; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
 ; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
 ; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
 ; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
+; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
 ;
 define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
Index: test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
===================================================================
--- test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
+++ test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -24,10 +24,10 @@
 for.end:
   ret void
 
-; CHECK: LV: Scalarizing:  %tmp1 = load i32, i32* %tmp0, align 4
-; CHECK: LV: Scalarizing:  store i32 %tmp2, i32* %tmp0, align 4
-
 ; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
 ; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Scalarizing:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing:  store i32 %tmp2, i32* %tmp0, align 4
 }