Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -186,6 +186,15 @@
   }
 };
 
+/// A VPlan paired with a VectorizationFactor; this is what planning returns
+/// as the best result of costing the candidate VPlans.
+struct VPlanVFPair {
+  /// The selected VPlan, or nullptr when no plan was selected.
+  VPlan *Plan;
+  /// The selected vectorization factor and its cost.
+  VectorizationFactor VF;
+};
+
 /// Planner drives the vectorization process after having passed
 /// Legality checks.
 class LoopVectorizationPlanner {
@@ -247,14 +256,14 @@
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
-  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
+  Optional<VPlanVFPair> plan(ElementCount UserVF, unsigned UserIC);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
-  VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
+  VPlanVFPair planInVPlanNativePath(ElementCount UserVF);
 
   /// Finalize the best decision and dispose of all other VPlans.
-  void setBestPlan(ElementCount VF, unsigned UF);
+  void setBestPlan(VPlan *Plan, ElementCount VF, unsigned UF);
 
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected VPlan.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -317,6 +317,10 @@
     cl::desc("Enable VPlan-native vectorization path predicator with "
              "support for outer loop vectorization."));
 
+cl::opt<bool> CostUsingVPlan("cost-using-vplan", cl::init(false), cl::Hidden,
+                             cl::desc("Enable VPlan based costing path. To "
+                                      "become the default in the future."));
+
 // This flag enables the stress testing of the VPlan H-CFG construction in the
 // VPlan-native vectorization path. It must be used in conjuction with
 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
@@ -1077,6 +1081,11 @@
   /// possible.
   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
 
+  /// \return The most profitable vplan and VF from a list of VPlans.
+  VPlanVFPair
+  selectVectorizationFactorFromVPlans(SmallVectorImpl<VPlanPtr> &VPlans,
+                                      unsigned MaxVF);
+
   /// Setup cost-based decisions for user vectorization factor.
   void selectUserVectorizationFactor(ElementCount UserVF) {
     collectUniformsAndScalars(UserVF);
@@ -1092,7 +1101,8 @@
   /// If interleave count has been specified by metadata it will be returned.
   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
   /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
+  unsigned selectInterleaveCount(VPlan *Plan, ElementCount VF,
+                                 unsigned LoopCost);
 
   /// Memory access instruction may be vectorized in more than one way.
   /// Form of instruction after vectorization depends on cost.
@@ -1436,6 +1446,10 @@
     Scalars.clear();
   }
 
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1444,25 +1458,12 @@
   /// to cost.
   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
 
-  /// The vectorization cost is a combination of the cost itself and a boolean
-  /// indicating whether any of the contributing operations will actually
-  /// operate on
-  /// vector values after type legalization in the backend. If this latter value
-  /// is
-  /// false, then all operations will be scalarized (i.e. no vectorization has
-  /// actually taken place).
-  using VectorizationCostTy = std::pair<unsigned, bool>;
-
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(ElementCount VF);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
@@ -5451,6 +5452,74 @@
   return Factor;
 }
 
+/// Cost each candidate VF of each VPlan and return the cheapest plan/VF pair,
+/// falling back to the scalar plan when no other candidate qualifies.
+VPlanVFPair LoopVectorizationCostModel::selectVectorizationFactorFromVPlans(
+    SmallVectorImpl<VPlanPtr> &VPlans, unsigned MaxVF) {
+  VPCostContext Ctx{*this, *Legal};
+  bool ForceVectorization =
+      Hints->getForce() == LoopVectorizeHints::FK_Enabled && MaxVF > 1;
+
+  VPlan *BestPlan = nullptr, *ScalarPlan = nullptr;
+  ElementCount BestVF = ElementCount::getNull();
+  float BestCost = 0.0f, ScalarCost = 0.0f;
+  for (const auto &Plan : VPlans) {
+    for (ElementCount VF : Plan->getVFs()) {
+      if (VF.getKnownMinValue() > MaxVF) {
+        LLVM_DEBUG(dbgs() << "  Skipping due to MaxVF\n");
+        continue;
+      }
+      VectorizationCostTy Cost = Plan->cost(VF, Ctx);
+      float VectorCost = Cost.first / (float)VF.getKnownMinValue();
+      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF.getKnownMinValue()
+                        << " costs: " << (int)VectorCost << ".\n");
+      // Always remember the scalar plan, even when it is not a candidate, so
+      // that the fallbacks below have a valid plan and cost to return.
+      if (!ScalarPlan && VF.isScalar()) {
+        ScalarPlan = &*Plan;
+        ScalarCost = VectorCost;
+      }
+      if (ForceVectorization && VF.isScalar()) {
+        LLVM_DEBUG(dbgs() << "  Skipping due to force vectorization\n");
+        continue;
+      }
+      if (!VF.isScalar() && !Cost.second && !ForceVectorization) {
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Not considering vector loop of width "
+            << VF.getKnownMinValue()
+            << " because it will not generate any vector instructions.\n");
+        continue;
+      }
+      if (!BestPlan || VectorCost < BestCost) {
+        BestPlan = &*Plan;
+        BestVF = VF;
+        BestCost = VectorCost;
+      }
+    }
+  }
+
+  if (!EnableCondStoresVectorization && NumPredStores) {
+    reportVectorizationFailure("There are conditional stores.",
+        "store that is conditionally executed prevents vectorization",
+        "ConditionalStore", ORE, TheLoop);
+    // Discard any choice made above; the scalar fallback below takes over.
+    BestPlan = nullptr;
+  }
+
+  if (!BestPlan) {
+    assert(ScalarPlan && "Expected a scalar VPlan to fall back to");
+    BestPlan = ScalarPlan;
+    BestVF = ElementCount::getFixed(1);
+    BestCost = ScalarCost;
+  }
+
+  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestVF << ".\n");
+  VectorizationFactor Factor = {
+      BestVF, (unsigned)(BestCost * BestVF.getKnownMinValue())};
+  return {BestPlan, Factor};
+}
+
 std::pair<unsigned, unsigned>
 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   unsigned MinWidth = -1U;
@@ -5507,7 +5576,8 @@
   return {MinWidth, MaxWidth};
 }
 
-unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(VPlan *Plan,
+                                                           ElementCount VF,
                                                            unsigned LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -5612,8 +5682,13 @@
 
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
-  if (LoopCost == 0)
-    LoopCost = expectedCost(VF).first;
+  if (LoopCost == 0) {
+    if (CostUsingVPlan) {
+      VPCostContext Ctx{*this, *Legal};
+      LoopCost = Plan->cost(VF, Ctx).first;
+    } else
+      LoopCost = expectedCost(VF).first;
+  }
 
   assert(LoopCost && "Non-zero loop cost expected");
 
@@ -6068,8 +6143,7 @@
   return Discount;
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(ElementCount VF) {
+VectorizationCostTy LoopVectorizationCostModel::expectedCost(ElementCount VF) {
   assert(!VF.isScalable() && "scalable vectors not yet supported.");
   VectorizationCostTy Cost;
 
@@ -6314,7 +6388,7 @@
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
+VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
   assert(!VF.isScalable() &&
@@ -6927,7 +7001,7 @@
   return WidestVectorRegBits / WidestType;
 }
 
-VectorizationFactor
+VPlanVFPair
 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
   ElementCount VF = UserVF;
@@ -6959,19 +7033,20 @@
 
     // For VPlan build stress testing, we bail out after VPlan construction.
     if (VPlanBuildStressTest)
-      return VectorizationFactor::Disabled();
+      return {nullptr, VectorizationFactor::Disabled()};
 
-    return {VF, 0 /*Cost*/};
+    assert(VPlans.size() == 1 && "Expected a single vplan!");
+    return {&*VPlans.front(), {VF, 0 /*Cost*/}};
   }
 
   LLVM_DEBUG(
       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                 "VPlan-native path.\n");
-  return VectorizationFactor::Disabled();
+  return {nullptr, VectorizationFactor::Disabled()};
 }
 
-Optional<VectorizationFactor>
-LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+Optional<VPlanVFPair> LoopVectorizationPlanner::plan(ElementCount UserVF,
+                                                     unsigned UserIC) {
   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<unsigned> MaybeMaxVF =
@@ -7004,7 +7079,9 @@
     buildVPlansWithVPRecipes(UserVF.getKnownMinValue(),
                              UserVF.getKnownMinValue());
     LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    assert(VPlans.size() == 1 && VPlans.front()->hasVF(UserVF) &&
+           "Expected a correct width vplan!");
+    return VPlanVFPair{&*VPlans.front(), {UserVF, 0}};
   }
 
   unsigned MaxVF = MaybeMaxVF.getValue();
@@ -7024,22 +7101,38 @@
 
   buildVPlansWithVPRecipes(1, MaxVF);
   LLVM_DEBUG(printPlans(dbgs()));
-  if (MaxVF == 1)
-    return VectorizationFactor::Disabled();
+  if (MaxVF == 1) {
+    assert(VPlans.size() == 1 &&
+           VPlans.front()->hasVF(ElementCount::getFixed(MaxVF)));
+    return VPlanVFPair{&*VPlans.front(), VectorizationFactor::Disabled()};
+  }
 
   // Select the optimal vectorization factor.
-  return CM.selectVectorizationFactor(MaxVF);
+  if (CostUsingVPlan)
+    return CM.selectVectorizationFactorFromVPlans(VPlans, MaxVF);
+  else {
+    VectorizationFactor VF = CM.selectVectorizationFactor(MaxVF);
+    for (VPlanPtr &Plan : VPlans)
+      if (Plan->hasVF(VF.Width))
+        return VPlanVFPair{&*Plan, VF};
+    llvm_unreachable("Expected to find a vplan with width VF!");
+  }
 }
 
-void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
+void LoopVectorizationPlanner::setBestPlan(VPlan *Plan, ElementCount VF,
+                                           unsigned UF) {
   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                     << '\n');
   BestVF = VF;
   BestUF = UF;
 
-  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
-    return !Plan->hasVF(VF);
-  });
+  if (!Plan) {
+    // No plan was selected, so discard every candidate VPlan.
+    VPlans.clear();
+    return;
+  }
+
+  erase_if(VPlans, [Plan](const VPlanPtr &P) { return &*P != Plan; });
   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
 }
 
@@ -7528,6 +7621,7 @@
   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
   VPBlockUtils::insertBlockAfter(Region, VPBB);
   auto *RegSucc = new VPBasicBlock();
+  RegSucc->setReciprocalPredBlockProb(getReciprocalPredBlockProb());
   VPBlockUtils::insertBlockAfter(RegSucc, Region);
   return RegSucc;
 }
@@ -7546,10 +7640,13 @@
   assert(Instr->getParent() && "Predicated instruction not in any basic block");
   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+  Entry->setReciprocalPredBlockProb(Builder.getInsertBlock()->getReciprocalPredBlockProb());
   auto *PHIRecipe =
       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+  Exit->setReciprocalPredBlockProb(getReciprocalPredBlockProb());
   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+  Pred->setReciprocalPredBlockProb(getReciprocalPredBlockProb());
   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
 
   // Note: first set Entry as region entry and then connect successors starting
@@ -7718,6 +7815,11 @@
     VPBB = FirstVPBBForBB;
     Builder.setInsertPoint(VPBB);
 
+    // Update the ReciprocalPredBlockProb of the block, used in costing.
+    // FIXME: This is not very accurate, and could be improved / replaced.
+    if (CM.blockNeedsPredication(BB))
+      VPBB->setReciprocalPredBlockProb(getReciprocalPredBlockProb());
+
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug instrinsics in VPlan.
     for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -7940,6 +8042,90 @@
   return ILV.getOrCreateScalarValue(V, Instance);
 }
 
+VectorizationCostTy VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
+  // Sum up the cost of each block found by a depth-first walk from the entry.
+  VectorizationCostTy Total;
+  for (VPBlockBase *VPB : depth_first(Entry)) {
+    const VectorizationCostTy BlockCost = VPB->cost(VF, Ctx);
+    Total.first += BlockCost.first;
+    Total.second |= BlockCost.second;
+  }
+
+  // The loop iteration check (add + icmp) is not part of the vplan, so its
+  // cost has to be added separately.
+  const auto &TargetInfo = Ctx.CM.TTI;
+  Type *InductionTy = Ctx.Legal.getWidestInductionType();
+  const unsigned LatchCheckCost =
+      TargetInfo.getArithmeticInstrCost(Instruction::Add, InductionTy) +
+      TargetInfo.getCmpSelInstrCost(Instruction::ICmp, InductionTy);
+  Total.first += LatchCheckCost;
+  LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << LatchCheckCost
+                    << " for VF " << VF
+                    << " For loop induction check (add + icmp)\n");
+
+  // The backedge branch is often free, but not on every target.
+  const unsigned BackedgeCost =
+      TargetInfo.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+  Total.first += BackedgeCost;
+  LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << BackedgeCost
+                    << " for VF " << VF
+                    << " For loop backedge cost (br)\n");
+
+  return Total;
+}
+
+VectorizationCostTy VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
+  // Visit the region's blocks in reverse post-order, starting at its entry,
+  // and accumulate their costs.
+  VectorizationCostTy RegionCost;
+  ReversePostOrderTraversal<VPBlockBase *> Order(Entry);
+  for (VPBlockBase *VPB : Order) {
+    const VectorizationCostTy BlockCost = VPB->cost(VF, Ctx);
+    RegionCost.first += BlockCost.first;
+    RegionCost.second |= BlockCost.second;
+  }
+
+  return RegionCost;
+}
+
+VectorizationCostTy VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
+  VPSlotTracker SlotTracker(getPlan());
+  VectorizationCostTy Total;
+
+  for (VPRecipeBase &R : Recipes) {
+    // Recipes whose underlying value is on one of the cost model's ignore
+    // lists contribute nothing.
+    // FIXME: This should go via VPValues getUnderlyingValue.
+    if (VPValue *Def = R.toVPValue()) {
+      auto *UV = Def->getUnderlyingValue();
+      if (Ctx.CM.ValuesToIgnore.count(UV) ||
+          (VF.isVector() && Ctx.CM.VecValuesToIgnore.count(UV)))
+        continue;
+    }
+
+    VectorizationCostTy RecipeCost = R.cost(VF, Ctx);
+
+    // A forced instruction cost, when given, replaces the computed one.
+    if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+      RecipeCost.first = ForceTargetInstructionCost;
+
+    Total.first += RecipeCost.first;
+    Total.second |= RecipeCost.second;
+    LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << RecipeCost.first
+                      << " for VF " << VF << " For recipe: ";
+               R.print(dbgs(), "", SlotTracker); dbgs() << '\n');
+  }
+
+  // Vectorizing if-converts a predicated block, so its instructions (other
+  // than stores and instructions that may divide by zero) then execute
+  // unconditionally. The scalar loop, in contrast, may skip the block, so
+  // the scalar cost is scaled by the probability of executing it.
+  if (VF.isScalar())
+    Total.first /= getReciprocalPredBlockProb();
+
+  return Total;
+}
+
 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
@@ -7960,28 +8146,57 @@
   State.ILV->widenCallInstruction(Ingredient, *this, State);
 }
 
+VectorizationCostTy VPWidenCallRecipe::cost(ElementCount VF,
+                                            VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(&Ingredient, VF);
+}
+
 void VPWidenSelectRecipe::execute(VPTransformState &State) {
   State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State);
 }
 
+VectorizationCostTy VPWidenSelectRecipe::cost(ElementCount VF,
+                                              VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(&Ingredient, VF);
+}
+
 void VPWidenRecipe::execute(VPTransformState &State) {
   State.ILV->widenInstruction(*getUnderlyingInstr(), *this, State);
 }
 
+VectorizationCostTy VPWidenRecipe::cost(ElementCount VF, VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(getUnderlyingInstr(), VF);
+}
+
 void VPWidenGEPRecipe::execute(VPTransformState &State) {
   State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant,
                       IsIndexLoopInvariant, State);
 }
 
+VectorizationCostTy VPWidenGEPRecipe::cost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(GEP, VF);
+}
+
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
   State.ILV->widenIntOrFpInduction(IV, Trunc);
 }
 
+VectorizationCostTy VPWidenIntOrFpInductionRecipe::cost(ElementCount VF,
+                                                        VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(IV, VF);
+}
+
 void VPWidenPHIRecipe::execute(VPTransformState &State) {
   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
 }
 
+VectorizationCostTy VPWidenPHIRecipe::cost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(Phi, VF);
+}
+
 void VPBlendRecipe::execute(VPTransformState &State) {
   State.ILV->setDebugLocFromInst(State.Builder, Phi);
   // We know that all PHIs in non-header blocks are converted into
@@ -8021,11 +8236,28 @@
     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
 }
 
+VectorizationCostTy VPBlendRecipe::cost(ElementCount VF, VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(Phi, VF);
+}
+
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
 }
 
+VectorizationCostTy VPInterleaveRecipe::cost(ElementCount VF,
+                                             VPCostContext &Ctx) {
+  VectorizationCostTy Cost = {0, false};
+  // Member indices run over [0, factor) and may have gaps, so walk every slot.
+  for (unsigned I = 0, E = IG->getFactor(); I != E; ++I)
+    if (Instruction *Member = IG->getMember(I)) {
+      VectorizationCostTy MC = Ctx.CM.getInstructionCost(Member, VF);
+      Cost.first += MC.first;
+      Cost.second |= MC.second;
+    }
+  return Cost;
+}
+
 void VPReductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
   for (unsigned Part = 0; Part < State.UF; ++Part) {
@@ -8059,6 +8291,15 @@
   }
 }
 
+VectorizationCostTy VPReductionRecipe::cost(ElementCount VF,
+                                            VPCostContext &Ctx) {
+  unsigned Cost = Ctx.CM.TTI.getArithmeticReductionCost(
+      RdxDesc->getRecurrenceBinOp(),
+      VectorType::get(RdxDesc->getRecurrenceType(), VF), false,
+      TTI::TCK_RecipThroughput);
+  return {Cost, false};
+}
+
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
@@ -8087,6 +8328,11 @@
                                       IsPredicated, State);
 }
 
+VectorizationCostTy VPReplicateRecipe::cost(ElementCount VF,
+                                            VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(Ingredient, VF);
+}
+
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Branch on Mask works only on single instance.");
 
@@ -8113,6 +8359,28 @@
   ReplaceInstWithInst(CurrentTerminator, CondBr);
 }
 
+VectorizationCostTy VPBranchOnMaskRecipe::cost(ElementCount VF,
+                                               VPCostContext &Ctx) {
+  // Only a vector VF needs the branches costed here.
+  if (!VF.isVector())
+    return {0, false};
+
+  // Scalarized and predicated instructions end up in VF predicated blocks in
+  // the vectorized loop. Besides the branch around each block, the i1 element
+  // of the vector compare has to be extracted for it.
+  assert(!VF.isScalable() && "scalable vectors not yet supported.");
+  const unsigned NumLanes = VF.getKnownMinValue();
+
+  LLVMContext &LLVMCtx = Ctx.CM.TheLoop->getHeader()->getContext();
+  auto *MaskVecTy = VectorType::get(IntegerType::getInt1Ty(LLVMCtx), VF);
+  unsigned ExtractCost = Ctx.CM.TTI.getScalarizationOverhead(
+      MaskVecTy, APInt::getAllOnesValue(NumLanes), false, true);
+  unsigned BranchCost = Ctx.CM.TTI.getCFInstrCost(
+      Instruction::Br, TargetTransformInfo::TCK_RecipThroughput);
+
+  return {ExtractCost + BranchCost * NumLanes, false};
+}
+
 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Predicated instruction PHI works per instance.");
   Instruction *ScalarPredInst = cast<Instruction>(
@@ -8144,6 +8412,11 @@
   }
 }
 
+VectorizationCostTy VPPredInstPHIRecipe::cost(ElementCount VF,
+                                              VPCostContext &Ctx) {
+  return { 0, false };
+}
+
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   Instruction *Instr = getUnderlyingInstr();
   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
@@ -8152,6 +8425,24 @@
                                         StoredValue, getMask());
 }
 
+VectorizationCostTy VPWidenMemoryInstructionRecipe::cost(ElementCount VF,
+                                                         VPCostContext &Ctx) {
+  return Ctx.CM.getInstructionCost(getUnderlyingInstr(), VF);
+}
+
+VectorizationCostTy VPWidenCanonicalIVRecipe::cost(ElementCount VF,
+                                                   VPCostContext &Ctx) {
+  return {Ctx.CM.TTI.getCFInstrCost(Instruction::PHI,
+                                    TargetTransformInfo::TCK_RecipThroughput),
+          false};
+}
+
+VectorizationCostTy VPInstruction::cost(ElementCount VF, VPCostContext &Ctx) {
+  // FIXME: Cost everything that a VPInstruction can be, which likely needs type
+  // information.
+  return {0, false};
+}
+
 // Determine how to lower the scalar epilogue, which depends on 1) optimising
 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
 // predication, and 4) a TTI hook that analyses whether the loop is suitable
@@ -8234,20 +8525,19 @@
   const unsigned UserVF = Hints.getWidth();
 
   // Plan how to best vectorize, return the best VF and its cost.
-  const VectorizationFactor VF =
-      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
+  auto PlanVF = LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
 
   // If we are stress testing VPlan builds, do not attempt to generate vector
   // code. Masked vector code generation support will follow soon.
   // Also, do not attempt to vectorize if no vector code will be produced.
   if (VPlanBuildStressTest || EnableVPlanPredication ||
-      VectorizationFactor::Disabled() == VF)
+      VectorizationFactor::Disabled() == PlanVF.VF)
     return false;
 
-  LVP.setBestPlan(VF.Width, 1);
+  LVP.setBestPlan(PlanVF.Plan, PlanVF.VF.Width, 1);
 
-  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                         &CM, BFI, PSI);
+  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, PlanVF.VF.Width, 1,
+                         LVL, &CM, BFI, PSI);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
   LVP.executePlan(LB, DT);
@@ -8401,16 +8691,18 @@
   unsigned UserIC = Hints.getInterleave();
 
   // Plan how to best vectorize, return the best VF and its cost.
-  Optional<VectorizationFactor> MaybeVF =
+  Optional<VPlanVFPair> MaybeVF =
       LVP.plan(ElementCount::getFixed(UserVF), UserIC);
 
+  VPlan *BestPlan = nullptr;
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
   if (MaybeVF) {
-    VF = *MaybeVF;
+    BestPlan = (*MaybeVF).Plan;
+    VF = (*MaybeVF).VF;
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+    IC = CM.selectInterleaveCount(BestPlan, VF.Width, VF.Cost);
   }
 
   // Identify the diagnostic messages that should be produced.
@@ -8502,7 +8794,7 @@
     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
   }
 
-  LVP.setBestPlan(VF.Width, IC);
+  LVP.setBestPlan(BestPlan, VF.Width, IC);
 
   using namespace ore;
   bool DisableRuntimeUnroll = false;
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -59,6 +59,8 @@
 class VPRegionBlock;
 class VPlan;
 class VPlanSlp;
+class LoopVectorizationCostModel;
+class LoopVectorizationLegality;
 
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
@@ -87,6 +89,13 @@
   unsigned Lane;
 };
 
+/// The vectorization cost is a combination of the cost itself and a boolean
+/// indicating whether any of the contributing operations will actually
+/// operate on vector values after type legalization in the backend. If this
+/// latter value is false, then all operations will be scalarized (i.e. no
+/// vectorization has actually taken place).
+using VectorizationCostTy = std::pair<unsigned, bool>;
+
 /// This is a helper struct for maintaining vectorization state. It's used for
 /// mapping values from the original loop to their corresponding values in
 /// the new loop. Two mappings are maintained: one for vectorized values and
@@ -358,6 +367,16 @@
   VPCallback &Callback;
 };
 
+/// Context shared by the cost() calculations. For now it only carries
+/// references to the CostModel and Legality; more can be added as needed.
+struct VPCostContext {
+  /// The original CostModel, which is currently used for getting instruction
+  /// cost.
+  LoopVectorizationCostModel &CM;
+  /// The Legality analysis.
+  LoopVectorizationLegality &Legal;
+};
+
 /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
 /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
 class VPBlockBase {
@@ -583,6 +602,8 @@
   /// VPBlockBase, thereby "executing" the VPlan.
   virtual void execute(struct VPTransformState *State) = 0;
 
+  virtual VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) = 0;
+
   /// Delete all blocks reachable from a given VPBlockBase, inclusive.
   static void deleteCFG(VPBlockBase *Entry);
 
@@ -654,6 +675,8 @@
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(struct VPTransformState &State) = 0;
 
+  virtual VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) = 0;
+
   /// Each recipe prints itself.
   virtual void print(raw_ostream &O, const Twine &Indent,
                      VPSlotTracker &SlotTracker) const = 0;
@@ -776,6 +799,8 @@
   /// provided.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the Recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -836,6 +861,8 @@
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -861,6 +888,8 @@
   /// Produce a widened version of the call instruction.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -892,6 +921,8 @@
   /// Produce a widened version of the select instruction.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -930,6 +961,8 @@
   /// Generate the gep nodes.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -955,6 +988,8 @@
   /// needed by their users.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -976,6 +1011,8 @@
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1016,6 +1053,8 @@
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1055,6 +1094,8 @@
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1097,6 +1138,8 @@
   /// Generate the reduction in the loop
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1154,6 +1197,8 @@
   /// the \p State.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   void setAlsoPack(bool Pack) { AlsoPack = Pack; }
 
   /// Print the recipe.
@@ -1178,10 +1223,12 @@
   /// conditional branch.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override {
-    O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+    O << Indent << "\"BRANCH-ON-MASK ";
     if (VPValue *Mask = getMask())
       Mask->print(O, SlotTracker);
     else
@@ -1221,6 +1268,8 @@
   /// Generates phi nodes for live-outs as needed to retain SSA form.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1289,6 +1338,8 @@
   /// Generate the wide load/store.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1318,6 +1369,8 @@
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
@@ -1334,6 +1387,8 @@
   /// The VPRecipes held in the order of output instructions to generate.
   RecipeListTy Recipes;
 
+  unsigned ReciprocalPredBlockProb = 1; // Presumably 1/P(block runs); confirm.
+
 public:
   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
       : VPBlockBase(VPBasicBlockSC, Name.str()) {
@@ -1404,6 +1459,13 @@
   /// Return the position of the first non-phi node recipe in the block.
   iterator getFirstNonPhi();
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
+
+  unsigned getReciprocalPredBlockProb() const { // Get/set pair; no validation.
+    return ReciprocalPredBlockProb;
+  }
+  void setReciprocalPredBlockProb(unsigned V) { ReciprocalPredBlockProb = V; }
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -1490,6 +1552,8 @@
   /// The method which generates the output IR instructions that correspond to
   /// this VPRegionBlock, thereby "executing" the VPlan.
   void execute(struct VPTransformState *State) override;
+
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 //===----------------------------------------------------------------------===//
@@ -1679,6 +1743,8 @@
   /// Generate the IR code for this VPlan.
   void execute(struct VPTransformState *State);
 
+  VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx);
+
   VPBlockBase *getEntry() { return Entry; }
   const VPBlockBase *getEntry() const { return Entry; }
 
@@ -1695,6 +1761,8 @@
     return BackedgeTakenCount;
   }
 
+  const SmallSetVector<ElementCount, 2> &getVFs() const { return VFs; } // Read-only view of VFs.
+
   void addVF(ElementCount VF) { VFs.insert(VF); }
 
   bool hasVF(ElementCount VF) { return VFs.count(VF); }
Index: llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s 
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,11 +11,16 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 5 for VF 2 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 8 for VF 8 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 22 for VF 16 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 8 for VF 8 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 22 for VF 16 For instruction:   %0 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load
+;CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: Found an estimated cost of 22 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4
 entry:
   br label %for.body
 
@@ -46,11 +52,16 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 6 for VF 2 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 9 for VF 4 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 18 for VF 8 For instruction:   %0 = load
-;CHECK: Found an estimated cost of 35 for VF 16 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 6 for VF 2 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 9 for VF 4 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 18 for VF 8 For instruction:   %0 = load
+;CHECK-CM: Found an estimated cost of 35 for VF 16 For instruction:   %0 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load
+;CHECK-VP: Found an estimated cost of 6 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5
+;CHECK-VP: Found an estimated cost of 9 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5
+;CHECK-VP: Found an estimated cost of 18 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5
+;CHECK-VP: Found an estimated cost of 35 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5
 entry:
   br label %for.body
 
Index: llvm/test/Analysis/CostModel/X86/interleave-store-i32.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleave-store-i32.ll
+++ llvm/test/Analysis/CostModel/X86/interleave-store-i32.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,11 +11,16 @@
 ; Function Attrs: nounwind uwtable
 define void @store_i32_interleave4() {
 ;CHECK-LABEL: store_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add16
-;CHECK: Found an estimated cost of 5 for VF 2 For instruction:   store i32 %add16
-;CHECK: Found an estimated cost of 5 for VF 4 For instruction:   store i32 %add16
-;CHECK: Found an estimated cost of 11 for VF 8 For instruction:   store i32 %add16
-;CHECK: Found an estimated cost of 22 for VF 16 For instruction:   store i32 %add16
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add16
+;CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction:   store i32 %add16
+;CHECK-CM: Found an estimated cost of 5 for VF 4 For instruction:   store i32 %add16
+;CHECK-CM: Found an estimated cost of 11 for VF 8 For instruction:   store i32 %add16
+;CHECK-CM: Found an estimated cost of 22 for VF 16 For instruction:   store i32 %add16
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %add16
+;CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: Found an estimated cost of 11 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: Found an estimated cost of 22 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4
 entry:
   br label %for.body
 
@@ -46,11 +52,16 @@
 
 define void @store_i32_interleave5() {
 ;CHECK-LABEL: store_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add22
-;CHECK: Found an estimated cost of 7 for VF 2 For instruction:   store i32 %add22
-;CHECK: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %add22
-;CHECK: Found an estimated cost of 21 for VF 8 For instruction:   store i32 %add22
-;CHECK: Found an estimated cost of 35 for VF 16 For instruction:   store i32 %add22
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %add22
+;CHECK-CM: Found an estimated cost of 7 for VF 2 For instruction:   store i32 %add22
+;CHECK-CM: Found an estimated cost of 14 for VF 4 For instruction:   store i32 %add22
+;CHECK-CM: Found an estimated cost of 21 for VF 8 For instruction:   store i32 %add22
+;CHECK-CM: Found an estimated cost of 35 for VF 16 For instruction:   store i32 %add22
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %add22
+;CHECK-VP: Found an estimated cost of 7 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5
+;CHECK-VP: Found an estimated cost of 14 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5
+;CHECK-VP: Found an estimated cost of 21 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5
+;CHECK-VP: Found an estimated cost of 35 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5
 entry:
   br label %for.body
 
Index: llvm/test/Analysis/CostModel/X86/interleaved-load-float.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleaved-load-float.ll
+++ llvm/test/Analysis/CostModel/X86/interleaved-load-float.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
 
@@ -10,7 +11,8 @@
 define void @stride8(float %k, i32 %width_) {
 entry:
 
-; CHECK: Found an estimated cost of 48 for VF 8 For instruction:   %0 = load float
+; CHECK-CM: Found an estimated cost of 48 for VF 8 For instruction:   %0 = load float
+; CHECK-VP: Found an estimated cost of 48 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 8
 
   %cmp72 = icmp sgt i32 %width_, 0
   br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
@@ -98,7 +100,8 @@
 define void @stride3(float %k, i32 %width_) {
 entry:
 
-; CHECK: Found an estimated cost of 20 for VF 8 For instruction:   %0 = load float
+; CHECK-CM: Found an estimated cost of 20 for VF 8 For instruction:   %0 = load float
+; CHECK-VP: Found an estimated cost of 20 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3
 
   %cmp27 = icmp sgt i32 %width_, 0
   br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
Index: llvm/test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ llvm/test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -1,17 +1,24 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=false --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=true --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: norecurse nounwind readonly uwtable
 define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels)  {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 11 for VF 2 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 10 for VF 8 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 13 for VF 16 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 16 for VF 32 For instruction:   %0 = load i8
+;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load
+;CHECK-VP: LV: Found an estimated cost of 11 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 10 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 13 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3
 entry:
   %cmp13 = icmp sgt i32 %Nels, 0
   br i1 %cmp13, label %for.body.preheader, label %for.end
@@ -50,12 +57,18 @@
 
 ; Function Attrs: norecurse nounwind readonly uwtable
 define i32 @doit_stride4(i8* nocapture readonly %Ptr, i32 %Nels) local_unnamed_addr {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 21 for VF 8 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 41 for VF 16 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 84 for VF 32 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 13 for VF 2 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 21 for VF 8 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 41 for VF 16 For instruction:   %0 = load i8
+;CHECK-CM: LV: Found an estimated cost of 84 for VF 32 For instruction:   %0 = load i8
+;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load
+;CHECK-VP: LV: Found an estimated cost of 13 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 21 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 41 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 84 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4
 entry:
   %cmp59 = icmp sgt i32 %Nels, 0
   br i1 %cmp59, label %for.body.preheader, label %for.end
Index: llvm/test/Analysis/CostModel/X86/interleaved-load-store-double.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleaved-load-store-double.ll
+++ llvm/test/Analysis/CostModel/X86/interleaved-load-store-double.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
 
@@ -10,8 +11,10 @@
 define void @stride2double(double %k, i32 %width_) {
 entry:
 
-; CHECK: Found an estimated cost of 8 for VF 4 For instruction:   %0 = load double
-; CHECK: Found an estimated cost of 8 for VF 4 For instruction:   store double
+; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction:   %0 = load double
+; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction:   store double
+; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %0
+; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2
 
   %cmp27 = icmp sgt i32 %width_, 0
   br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
Index: llvm/test/Analysis/CostModel/X86/interleaved-load-store-i64.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleaved-load-store-i64.ll
+++ llvm/test/Analysis/CostModel/X86/interleaved-load-store-i64.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=core-avx2 %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=core-avx2 -cost-using-vplan=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=core-avx2 -cost-using-vplan=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
 
@@ -10,8 +11,10 @@
 define void @stride2i64(i64 %k, i32 %width_) {
 entry:
 
-; CHECK: Found an estimated cost of 8 for VF 4 For instruction:   %0 = load i64
-; CHECK: Found an estimated cost of 8 for VF 4 For instruction:   store i64
+; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction:   %0 = load i64
+; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction:   store i64
+; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %0
+; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2
 
   %cmp27 = icmp sgt i32 %width_, 0
   br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
Index: llvm/test/Analysis/CostModel/X86/interleaved-store-i8.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/interleaved-store-i8.ll
+++ llvm/test/Analysis/CostModel/X86/interleaved-store-i8.ll
@@ -1,17 +1,24 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=false --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=true --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: norecurse nounwind uwtable
 define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv4
+;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv4
+;CHECK-CM: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i8 %conv4
+;CHECK-CM: LV: Found an estimated cost of 9 for VF 4 For instruction:   store i8 %conv4
+;CHECK-CM: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i8 %conv4
+;CHECK-CM: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %conv4
+;CHECK-CM: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv4
+;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %conv
+;CHECK-VP: LV: Found an estimated cost of 8 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 9 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 12 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 13 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3
+;CHECK-VP: LV: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3
 entry:
   %cmp14 = icmp sgt i32 %Nels, 0
   br i1 %cmp14, label %for.body.lr.ph, label %for.end
@@ -44,12 +51,18 @@
 
 ; Function Attrs: norecurse nounwind uwtable
 define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv7
+;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv7
+;CHECK-CM: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i8 %conv7
+;CHECK-CM: LV: Found an estimated cost of 10 for VF 4 For instruction:   store i8 %conv7
+;CHECK-CM: LV: Found an estimated cost of 11 for VF 8 For instruction:   store i8 %conv7
+;CHECK-CM: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i8 %conv7
+;CHECK-CM: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv7
+;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %conv
+;CHECK-VP: LV: Found an estimated cost of 13 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 10 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 11 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 12 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4
+;CHECK-VP: LV: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4
 entry:
   %cmp19 = icmp sgt i32 %Nels, 0
   br i1 %cmp19, label %for.body.lr.ph, label %for.end
Index: llvm/test/Analysis/CostModel/X86/strided-load-i16.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/strided-load-i16.ll
+++ llvm/test/Analysis/CostModel/X86/strided-load-i16.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,12 +11,18 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i16_stride2() {
 ;CHECK-LABEL: load_i16_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 32 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
 entry:
   br label %for.body
 
@@ -36,12 +43,18 @@
 
 define void @load_i16_stride3() {
 ;CHECK-LABEL: load_i16_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 5 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 5 for VF 32 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 5 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
 entry:
   br label %for.body
 
@@ -62,12 +75,18 @@
 
 define void @load_i16_stride4() {
 ;CHECK-LABEL: load_i16_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 8 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 8 for VF 32 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 8 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
 entry:
   br label %for.body
 
@@ -88,12 +107,18 @@
 
 define void @load_i16_stride5() {
 ;CHECK-LABEL: load_i16_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 5 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 10 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 5 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 10 for VF 32 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 5 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 10 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
 entry:
   br label %for.body
 
Index: llvm/test/Analysis/CostModel/X86/strided-load-i32.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/strided-load-i32.ll
+++ llvm/test/Analysis/CostModel/X86/strided-load-i32.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,11 +11,16 @@
 ; Function Attrs: nounwind uwtable
 define void @load_int_stride2() {
 ;CHECK-LABEL: load_int_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction:  %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 16 For instruction:  %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
 entry:
   br label %for.body
 
@@ -35,11 +41,16 @@
 
 define void @load_int_stride3() {
 ;CHECK-LABEL: load_int_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction:  %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 16 For instruction:  %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
 entry:
   br label %for.body
 
@@ -60,11 +71,16 @@
 
 define void @load_int_stride4() {
 ;CHECK-LABEL: load_int_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 5 for VF 16 For instruction:  %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 5 for VF 16 For instruction:  %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 5 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
 entry:
   br label %for.body
 
@@ -85,11 +101,16 @@
 
 define void @load_int_stride5() {
 ;CHECK-LABEL: load_int_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 6 for VF 16 For instruction:  %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 6 for VF 16 For instruction:  %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 6 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
 entry:
   br label %for.body
 
Index: llvm/test/Analysis/CostModel/X86/strided-load-i64.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/strided-load-i64.ll
+++ llvm/test/Analysis/CostModel/X86/strided-load-i64.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,10 +11,14 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i64_stride2() {
 ;CHECK-LABEL: load_i64_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
 entry:
   br label %for.body
 
@@ -34,10 +39,14 @@
 
 define void @load_i64_stride3() {
 ;CHECK-LABEL: load_i64_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 3 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
 entry:
   br label %for.body
 
@@ -58,10 +67,14 @@
 
 define void @load_i64_stride4() {
 ;CHECK-LABEL: load_i64_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 5 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 5 for VF 8 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 5 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
 entry:
   br label %for.body
 
Index: llvm/test/Analysis/CostModel/X86/strided-load-i8.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/strided-load-i8.ll
+++ llvm/test/Analysis/CostModel/X86/strided-load-i8.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,13 +11,20 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i8_stride2() {
 ;CHECK-LABEL: load_i8_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 4 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 8 for VF 32 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 20 for VF 64 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 4 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 8 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 20 for VF 64 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 8 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
+;CHECK-VP: Found an estimated cost of 20 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 2 at %1
 entry:
   br label %for.body
 
@@ -37,13 +45,20 @@
 
 define void @load_i8_stride3() {
 ;CHECK-LABEL: load_i8_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 4 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 13 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 16 for VF 32 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 25 for VF 64 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 4 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 13 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 16 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 25 for VF 64 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 13 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
+;CHECK-VP: Found an estimated cost of 25 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 3 at %1
 entry:
   br label %for.body
 
@@ -64,13 +79,20 @@
 
 define void @load_i8_stride4() {
 ;CHECK-LABEL: load_i8_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 4 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 59 for VF 64 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 4 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 8 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 20 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 59 for VF 64 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 20 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
+;CHECK-VP: Found an estimated cost of 59 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 4 at %1
 entry:
   br label %for.body
 
@@ -91,13 +113,20 @@
 
 define void @load_i8_stride5() {
 ;CHECK-LABEL: load_i8_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 4 for VF 4 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 8 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 20 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 39 for VF 32 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 78 for VF 64 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 4 for VF 4 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 8 for VF 8 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 20 for VF 16 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 39 for VF 32 For instruction:   %1 = load
+;CHECK-CM: Found an estimated cost of 78 for VF 64 For instruction:   %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load
+;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 20 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 39 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
+;CHECK-VP: Found an estimated cost of 78 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 5 at %1
 entry:
   br label %for.body
 
Index: llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize -cost-using-vplan=false 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize -cost-using-vplan=true 2>&1 | FileCheck %s --check-prefix=COST-VPLAN
 ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -13,6 +14,8 @@
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
 ; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+; COST-VPLAN-LABEL:  predicated_udiv_scalarized_operand
+; COST-VPLAN:        LV: Found an estimated cost of 4 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp2, %tmp3 (S->V)
 ;
 ; CHECK-LABEL: @predicated_udiv_scalarized_operand(
 ; CHECK:       vector.body:
Index: llvm/test/Transforms/LoopVectorize/AArch64/costmodel.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/costmodel.ll
@@ -0,0 +1,217 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -cost-using-vplan=false -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s -loop-vectorize -cost-using-vplan=true -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; This is a series of test cases that show potential differences between the
+; old cost model and the VPlan version. The scores are not necessarily precise;
+; they just show differences not tested elsewhere.
+
+; CHECK-LABEL: predicated_store
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i32, i32* %CF_marker_x, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction:   %0 = load i32, i32* %arrayidx, align 4
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp1 = icmp eq i32 %0, %fpt
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp1, label %if.then, label %for.inc
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx3 = getelementptr inbounds double, double* %y_data, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction:   store double 0.000000e+00, double* %arrayidx3, align 8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   br label %for.inc
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+; CHECK-CM: LV: Scalar loop costs: 6.
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx = getelementptr inbounds i32, i32* %CF_marker_x, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   %0 = load i32, i32* %arrayidx, align 4
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cmp1 = icmp eq i32 %0, %fpt
+; CHECK-CM: LV: Found an estimated cost of 3 for VF 2 For instruction:   br i1 %cmp1, label %if.then, label %for.inc
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx3 = getelementptr inbounds double, double* %y_data, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction:   store double 0.000000e+00, double* %arrayidx3, align 8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+; CHECK-CM: LV: Vector loop of width 2 costs: 5.
+; CHECK-CM: LV: Selecting VF: 2.
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx = getelementptr %CF_marker_x, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE %0 = load %arrayidx
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %cmp1 = icmp %0, %fpt
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx3 = getelementptr %y_data, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "BRANCH-ON-MASK ir<%cmp1>\l"
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE store 0.000000e+00, %arrayidx3
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For loop induction check (add + icmp)
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For loop backedge cost (br)
+; CHECK-VP: LV: Vector loop of width 1 costs: 6.
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "CLONE %arrayidx = getelementptr %CF_marker_x, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN load ir<%arrayidx>
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN\l""  %cmp1 = icmp %0, %fpt
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %arrayidx3 = getelementptr %y_data, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 3 for VF 2 For recipe: "BRANCH-ON-MASK ir<%cmp1>\l"
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %arrayidx3
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For loop induction check (add + icmp)
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For loop backedge cost (br)
+; CHECK-VP: LV: Vector loop of width 2 costs: 4.
+; CHECK-VP: LV: Selecting VF: 2.
+define i32 @predicated_store(i32* nocapture readonly %CF_marker_x, double* nocapture %y_data, i32 %num_rows, i32 %fpt) { ; for (i = 0; i < %num_rows; ++i) if (%CF_marker_x[i] == %fpt) %y_data[i] = 0.0
+entry:
+  %cmp8 = icmp sgt i32 %num_rows, 0 ; guard: the loop is entered only when %num_rows is positive
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %num_rows to i64 ; zero-extended bound compared against the i64 induction variable
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 undef ; the function produces no meaningful result
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, i32* %CF_marker_x, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp eq i32 %0, %fpt
+  br i1 %cmp1, label %if.then, label %for.inc ; the store in %if.then runs only for elements equal to %fpt
+
+if.then:                                          ; preds = %for.body
+  %arrayidx3 = getelementptr inbounds double, double* %y_data, i64 %indvars.iv
+  store double 0.000000e+00, double* %arrayidx3, align 8 ; conditionally executed store of 0.0 to %y_data[i]
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; leave the loop once the incremented index reaches the trip count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: vif
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ]
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction:   %1 = load float, float* %arrayidx.us, align 4
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp5.us = fcmp ogt float %1, 0.000000e+00
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp5.us, label %if.then.us, label %for.inc.us
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction:   store float %1, float* %arrayidx9.us, align 4
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   br label %for.inc.us
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+; CHECK-CM: LV: Scalar loop costs: 6.
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ]
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load float, float* %arrayidx.us, align 4
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cmp5.us = fcmp ogt float %1, 0.000000e+00
+; CHECK-CM: LV: Found an estimated cost of 3 for VF 2 For instruction:   br i1 %cmp5.us, label %if.then.us, label %for.inc.us
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 3 for VF 2 For instruction:   store float %1, float* %arrayidx9.us, align 4
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc.us
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+; CHECK-CM: LV: Vector loop of width 2 costs: 5.
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ]
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load float, float* %arrayidx.us, align 4
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cmp5.us = fcmp ogt float %1, 0.000000e+00
+; CHECK-CM: LV: Found an estimated cost of 9 for VF 4 For instruction:   br i1 %cmp5.us, label %if.then.us, label %for.inc.us
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv
+; CHECK-CM: LV: Found an estimated cost of 8 for VF 4 For instruction:   store float %1, float* %arrayidx9.us, align 4
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   br label %for.inc.us
+; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+; CHECK-CM: LV: Vector loop of width 4 costs: 6.
+; CHECK-CM: LV: Selecting VF: 2.
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx.us = getelementptr %b, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE %1 = load %arrayidx.us
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %cmp5.us = fcmp %1, 0.000000e+00
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx9.us = getelementptr %a, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "BRANCH-ON-MASK ir<%cmp5.us>\l"
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE store %1, %arrayidx9.us
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For loop induction check (add + icmp)
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For loop backedge cost (br)
+; CHECK-VP: LV: Vector loop of width 1 costs: 6.
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "CLONE %arrayidx.us = getelementptr %b, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN load ir<%arrayidx.us>
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN\l""  %cmp5.us = fcmp %1, 0.000000e+00
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %arrayidx9.us = getelementptr %a, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 3 for VF 2 For recipe: "BRANCH-ON-MASK ir<%cmp5.us>\l"
+; CHECK-VP: LV: Found an estimated cost of 3 for VF 2 For recipe: "REPLICATE store %1, %arrayidx9.us
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For loop induction check (add + icmp)
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For loop backedge cost (br)
+; CHECK-VP: LV: Vector loop of width 2 costs: 5.
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For recipe: "CLONE %arrayidx.us = getelementptr %b, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 4 For recipe: "WIDEN load ir<%arrayidx.us>
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 4 For recipe: "WIDEN\l""  %cmp5.us = fcmp %1, 0.000000e+00
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %arrayidx9.us = getelementptr %a, %indvars.iv
+; CHECK-VP: LV: Found an estimated cost of 9 for VF 4 For recipe: "BRANCH-ON-MASK ir<%cmp5.us>\l"
+; CHECK-VP: LV: Found an estimated cost of 8 for VF 4 For recipe: "REPLICATE store %1, %arrayidx9.us
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 4 For loop induction check (add + icmp)
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For loop backedge cost (br)
+; CHECK-VP: LV: Vector loop of width 4 costs: 5.
+; CHECK-VP: LV: Selecting VF: 2.
+define i32 @vif(i32 %ntimes, i32 %LEN, float* %a, float* %b, float* %c, float* %d, float* %e, i32 %aa, i32 %bb, i32 %cc) { ; Loop nest: %ntimes outer trips, each copying b[i] to a[i] where b[i] > 0.0 for i in [0, %LEN); %c, %d, %e, %aa, %bb, %cc are unused.
+entry:
+  %cmp27 = icmp sgt i32 %ntimes, 0 ; Is there at least one outer iteration?
+  br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup ; If not, go straight to the return.
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp225 = icmp sgt i32 %LEN, 0 ; Does the inner loop have any iterations?
+  br i1 %cmp225, label %for.cond1.preheader.us.preheader, label %for.cond1.preheader.preheader ; Yes: take the ".us" nest that contains the inner loop; no: take the outer-only loop.
+
+for.cond1.preheader.preheader:                    ; preds = %for.cond1.preheader.lr.ph
+  br label %for.cond1.preheader
+
+for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
+  %wide.trip.count = zext i32 %LEN to i64 ; Inner trip count widened to i64 to match the i64 induction variable.
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %nl.028.us = phi i32 [ %inc12.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] ; Outer iteration counter, starting at 0.
+  br label %for.body4.us
+
+for.body4.us:                                     ; preds = %for.cond1.preheader.us, %for.inc.us
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ] ; Inner induction variable i, restarted at 0 on every outer trip.
+  %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv ; &b[i]
+  %0 = load float, float* %arrayidx.us, align 4 ; b[i]
+  %cmp5.us = fcmp ogt float %0, 0.000000e+00 ; Ordered compare: false when b[i] is NaN.
+  br i1 %cmp5.us, label %if.then.us, label %for.inc.us ; Only the taken side performs the store, so the store is conditional.
+
+if.then.us:                                       ; preds = %for.body4.us
+  %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv ; &a[i]
+  store float %0, float* %arrayidx9.us, align 4 ; a[i] = b[i], executed only when b[i] > 0.0.
+  br label %for.inc.us
+
+for.inc.us:                                       ; preds = %if.then.us, %for.body4.us
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; Inner loop is done once i + 1 reaches the widened %LEN.
+  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us ; Latch of the inner loop.
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.inc.us
+  %inc12.us = add nuw nsw i32 %nl.028.us, 1 ; Advance the outer counter.
+  %exitcond30.not = icmp eq i32 %inc12.us, %ntimes ; Outer loop is done after %ntimes trips.
+  br i1 %exitcond30.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader.us
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+  %nl.028 = phi i32 [ %inc12, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ] ; Outer counter for the %LEN <= 0 path; this loop has no memory accesses.
+  %inc12 = add nuw nsw i32 %nl.028, 1
+  %exitcond31.not = icmp eq i32 %inc12, %ntimes
+  br i1 %exitcond31.not, label %for.cond.cleanup.loopexit33, label %for.cond1.preheader ; Single-block loop: it branches back to itself until %ntimes trips are counted.
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us
+  br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit33:                      ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit33, %for.cond.cleanup.loopexit, %entry
+  ret i32 0 ; The result is the constant 0 on every path.
+}
Index: llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -1,6 +1,7 @@
 ; REQUIRES: asserts
 
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -cost-using-vplan=false %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM-OLD %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -cost-using-vplan=true %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM-VPLAN %s
 ; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s
 
 ; Test case from PR41294.
@@ -8,9 +9,12 @@
 ; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
 ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
 
-; CM: LV: Scalar loop costs: 7.
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
+; CM-OLD: LV: Scalar loop costs: 7.
+; CM-OLD: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
+; CM-OLD-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
+; CM-VPLAN: LV: Vector loop of width 1 costs: 7.
+; CM-VPLAN: LV: Found an estimated cost of 5 for VF 2 For recipe:  "REPLICATE %a = extractvalue %sv
+; CM-VPLAN-NEXT: LV: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %b = extractvalue %sv
 
 ; Check that the extractvalue operands are actually free in vector code.
 
@@ -57,9 +61,12 @@
 ; Similar to the test case above, but checks getVectorCallCost as well.
 declare float @pow(float, float) readnone nounwind
 
-; CM: LV: Scalar loop costs: 16.
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
+; CM-OLD: LV: Scalar loop costs: 16.
+; CM-OLD: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
+; CM-OLD-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
+; CM-VPLAN: LV: Vector loop of width 1 costs: 16.
+; CM-VPLAN: LV: Found an estimated cost of 5 for VF 2 For recipe:  "REPLICATE %a = extractvalue %sv
+; CM-VPLAN-NEXT: LV: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %b = extractvalue %sv
 
 ; FORCED-LABEL: define void @test_getVectorCallCost
 
Index: llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -cost-using-vplan=false -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -cost-using-vplan=true -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 ; This test shows extremely high interleaving cost that, probably, should be fixed.
 ; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
@@ -11,8 +12,10 @@
 %pair = type { i8, i8 }
 
 ; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 20 for VF 2 For instruction:   {{.*}} load i8
-; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
+; CHECK-CM: Found an estimated cost of 20 for VF 2 For instruction:   {{.*}} load i8
+; CHECK-CM: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
+; CHECK-VP: Found an estimated cost of 20 for VF 2 For recipe:   {{.*}} load
+; CHECK-VP: Found an estimated cost of 0 for VF 2 For recipe:   {{.*}} load
 ; CHECK: vector.body
 ; CHECK: load i8
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -1,7 +1,11 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
-; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
-; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
-; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -22,6 +26,12 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+; VP_8-LABEL:  Checking a loop in "i8_factor_2"
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i8_factor_2"
+; VP_16:         Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
@@ -58,6 +68,15 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+; VP_4-LABEL:  Checking a loop in "i16_factor_2"
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i16_factor_2"
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i16_factor_2"
+; VP_16:         Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -99,6 +118,18 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+; VP_2-LABEL:  Checking a loop in "i32_factor_2"
+; VP_2:          Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_2:          Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_4-LABEL:  Checking a loop in "i32_factor_2"
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i32_factor_2"
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i32_factor_2"
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
@@ -140,6 +171,18 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VP_2-LABEL:  Checking a loop in "i64_factor_2"
+; VP_2:          Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_2:          Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_4-LABEL:  Checking a loop in "i64_factor_2"
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i64_factor_2"
+; VP_8:          Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i64_factor_2"
+; VP_16:         Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
@@ -172,6 +215,10 @@
 ; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VP_2-LABEL: Checking a loop in "i64_factor_8"
+; VP_2:         Found an estimated cost of 6 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 8 at %tmp2, ir<%tmp0>
+; VP_2:         Found an estimated cost of 7 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:    Found an estimated cost of 7 for VF 2 For recipe: "REPLICATE store 0, %tmp1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
Index: llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -1,12 +1,14 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -cost-using-vplan=false -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -cost-using-vplan=true -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK-CM:    LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK-VP-NOT: LV: Found an estimated cost of {{.*}} for VF 2 For recipe: {{.*}} %i.next = add
 ; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(i64* %a, i64 %n) {
@@ -27,7 +29,8 @@
 
 ; CHECK-LABEL: PR33193
 ; CHECK:       LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
+; CHECK-CM:    LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
+; CHECK-VP-NOT: LV: Found an estimated cost of {{.*}} for VF 8 For recipe: {{.*}} zext
 ; CHECK:       LV: Not considering vector loop of width 8 because it will not generate any vector instructions
 %struct.a = type { i32, i8 }
 define void @PR33193(%struct.a* %a, i64 %n) {
Index: llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -cost-using-vplan=false -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -cost-using-vplan=true -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -18,8 +19,9 @@
 ; Cost of udiv:
 ;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
 ;
-; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK:    Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp2, %tmp3 (S->V)
 ;
 define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
 entry:
@@ -59,8 +61,9 @@
 ; Cost of store:
 ;   (store(4) + extractelement(3)) / 2 = 3
 ;
-; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK:    Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK-CM: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK-VP: Found an estimated cost of 3 for VF 2 For recipe: "REPLICATE store %tmp2, %tmp0
 ;
 define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -100,8 +103,10 @@
 ;
 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
-; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK-CM: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %tmp3 = add %tmp2, %x
+; CHECK-VP: Found an estimated cost of 4 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp2, %tmp3
 ;
 define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -145,8 +150,10 @@
 ;
 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
-; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %tmp2 = add %tmp1, %x
+; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE store %tmp2, %tmp0
 ;
 define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -197,11 +204,16 @@
 ; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
-; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
-; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
-; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
-; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK-CM:  Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
+; CHECK-CM:  Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK-CM:  Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK-CM:  Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK-CM:  Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK-VP:  Found an estimated cost of 1 for VF 2 For recipe: "WIDEN\l"" %tmp2 = add %tmp1, %x
+; CHECK-VP:  Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %tmp3 = sdiv %tmp1, %tmp2
+; CHECK-VP:  Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp3, %tmp2
+; CHECK-VP:  Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %tmp5 = sub %tmp4, %x
+; CHECK-VP:  Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE store %tmp5, %tmp0
 ;
 define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
Index: llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -1,7 +1,11 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
-; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
-; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
-; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -22,6 +26,12 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+; VP_8-LABEL:  Checking a loop in "i8_factor_2"
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i8_factor_2"
+; VP_16:         Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
@@ -58,6 +68,15 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+; VP_4-LABEL:  Checking a loop in "i16_factor_2"
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i16_factor_2"
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i16_factor_2"
+; VP_16:         Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -99,6 +118,18 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+; VP_2-LABEL:  Checking a loop in "i32_factor_2"
+; VP_2:          Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_2:          Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_4-LABEL:  Checking a loop in "i32_factor_2"
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i32_factor_2"
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i32_factor_2"
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
@@ -130,6 +161,12 @@
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_8-NEXT:    Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VP_4-LABEL: Checking a loop in "half_factor_2"
+; VP_4:         Found an estimated cost of 40 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:         Found an estimated cost of 32 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_8-LABEL: Checking a loop in "half_factor_2"
+; VP_8:         Found an estimated cost of 80 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:         Found an estimated cost of 64 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -1,7 +1,11 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
-; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
-; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
-; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
@@ -34,6 +38,20 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+; VP_2-LABEL:  Checking a loop in "i8_factor_2"
+; VP_2:          Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-LABEL:  Checking a loop in "i8_factor_2"
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i8_factor_2"
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i8_factor_2"
+; VP_16:         Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
@@ -75,6 +93,20 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+; VP_2-LABEL:  Checking a loop in "i16_factor_2"
+; VP_2:          Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-LABEL:  Checking a loop in "i16_factor_2"
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i16_factor_2"
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i16_factor_2"
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -116,6 +148,20 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+; VP_2-LABEL:  Checking a loop in "i32_factor_2"
+; VP_2:          Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-LABEL:  Checking a loop in "i32_factor_2"
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "i32_factor_2"
+; VP_8:          Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "i32_factor_2"
+; VP_16:         Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
@@ -157,6 +203,26 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 576 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VP_2-LABEL:  Checking a loop in "i64_factor_2"
+; VP_2:          Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 16 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-LABEL:  Checking a loop in "i64_factor_2"
+; VP_4:          Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 48 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-LABEL:  Checking a loop in "i64_factor_2"
+; VP_8:          Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 160 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-LABEL: Checking a loop in "i64_factor_2"
+; VP_16:         Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 576 for VF 16 For recipe: "REPLICATE store 0, %tmp1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
@@ -198,6 +264,22 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VP_2-LABEL:  Checking a loop in "f16_factor_2"
+; VP_2:          Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_4-LABEL:  Checking a loop in "f16_factor_2"
+; VP_4:          Found an estimated cost of 72 for VF 4 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 40 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_8-LABEL:  Checking a loop in "f16_factor_2"
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "f16_factor_2"
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 0
@@ -239,6 +321,20 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VP_2-LABEL:  Checking a loop in "f32_factor_2"
+; VP_2:          Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_4-LABEL:  Checking a loop in "f32_factor_2"
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_4:          Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_8-LABEL:  Checking a loop in "f32_factor_2"
+; VP_8:          Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_8:          Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
+; VP_16-LABEL: Checking a loop in "f32_factor_2"
+; VP_16:         Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0>
+; VP_16:         Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1>
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 0
@@ -280,6 +376,26 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 544 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VP_2-LABEL:  Checking a loop in "f64_factor_2"
+; VP_2:          Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_4-LABEL:  Checking a loop in "f64_factor_2"
+; VP_4:          Found an estimated cost of 72 for VF 4 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 40 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_8-LABEL:  Checking a loop in "f64_factor_2"
+; VP_8:          Found an estimated cost of 272 for VF 8 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 144 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_16-LABEL: Checking a loop in "f64_factor_2"
+; VP_16:         Found an estimated cost of 1056 for VF 16 For recipe: "REPLICATE %tmp2 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 544 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 0
@@ -333,6 +449,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
+; VP_2-LABEL:  Checking a loop in "i8_factor_3"
+; VP_2:          Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-LABEL:  Checking a loop in "i8_factor_3"
+; VP_4:          Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-LABEL:  Checking a loop in "i8_factor_3"
+; VP_8:          Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-LABEL: Checking a loop in "i8_factor_3"
+; VP_16:         Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.3, %i8.3* %data, i64 %i, i32 0
@@ -385,6 +529,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
+; VP_2-LABEL:  Checking a loop in "i16_factor_3"
+; VP_2:          Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-LABEL:  Checking a loop in "i16_factor_3"
+; VP_4:          Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-LABEL:  Checking a loop in "i16_factor_3"
+; VP_8:          Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-LABEL: Checking a loop in "i16_factor_3"
+; VP_16:         Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.3, %i16.3* %data, i64 %i, i32 0
@@ -437,6 +609,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
+; VP_2-LABEL:  Checking a loop in "i32_factor_3"
+; VP_2:          Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-LABEL:  Checking a loop in "i32_factor_3"
+; VP_4:          Found an estimated cost of 24 for VF 4 For recipe: "WIDEN load ir<%tmp0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0>
+; VP_4-NEXT:     Found an estimated cost of 24 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0>
+; VP_8-LABEL:  Checking a loop in "i32_factor_3"
+; VP_8:          Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-LABEL: Checking a loop in "i32_factor_3"
+; VP_16:         Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.3, %i32.3* %data, i64 %i, i32 0
@@ -489,6 +689,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 864 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
+; VP_2-LABEL:  Checking a loop in "i64_factor_3"
+; VP_2:          Found an estimated cost of 36 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-LABEL:  Checking a loop in "i64_factor_3"
+; VP_4:          Found an estimated cost of 120 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 72 for VF 4 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-LABEL:  Checking a loop in "i64_factor_3"
+; VP_8:          Found an estimated cost of 432 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 240 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-LABEL: Checking a loop in "i64_factor_3"
+; VP_16:         Found an estimated cost of 1632 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 864 for VF 16 For recipe: "REPLICATE store 0, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 0
@@ -541,6 +769,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VP_2-LABEL:  Checking a loop in "f16_factor_3"
+; VP_2:          Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_4-LABEL:  Checking a loop in "f16_factor_3"
+; VP_4:          Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_8-LABEL:  Checking a loop in "f16_factor_3"
+; VP_8:          Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_16-LABEL: Checking a loop in "f16_factor_3"
+; VP_16:         Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 0
@@ -593,6 +849,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VP_2-LABEL:  Checking a loop in "f32_factor_3"
+; VP_2:          Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_4-LABEL:  Checking a loop in "f32_factor_3"
+; VP_4:          Found an estimated cost of 24 for VF 4 For recipe: "WIDEN load ir<%tmp0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0.000000e+00>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0.000000e+00>
+; VP_4-NEXT:     Found an estimated cost of 24 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0.000000e+00>
+; VP_8-LABEL:  Checking a loop in "f32_factor_3"
+; VP_8:          Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_16-LABEL: Checking a loop in "f32_factor_3"
+; VP_16:         Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 0
@@ -645,6 +929,34 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 816 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VP_2-LABEL:  Checking a loop in "f64_factor_3"
+; VP_2:          Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_4-LABEL:  Checking a loop in "f64_factor_3"
+; VP_4:          Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_8-LABEL:  Checking a loop in "f64_factor_3"
+; VP_8:          Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_16-LABEL: Checking a loop in "f64_factor_3"
+; VP_16:         Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 0
@@ -708,6 +1020,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
+; VP_2-LABEL:  Checking a loop in "i8_factor_4"
+; VP_2:          Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp3
+; VP_4-LABEL:  Checking a loop in "i8_factor_4"
+; VP_4:          Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-NEXT:     Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0, %tmp3
+; VP_8-LABEL:  Checking a loop in "i8_factor_4"
+; VP_8:          Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0, %tmp3
+; VP_16-LABEL: Checking a loop in "i8_factor_4"
+; VP_16:         Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0
@@ -771,6 +1119,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
+; VP_2-LABEL:  Checking a loop in "i16_factor_4"
+; VP_2:          Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp3
+; VP_4-LABEL:  Checking a loop in "i16_factor_4"
+; VP_4:          Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-NEXT:     Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0, %tmp3
+; VP_8-LABEL:  Checking a loop in "i16_factor_4"
+; VP_8:          Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0, %tmp3
+; VP_16-LABEL: Checking a loop in "i16_factor_4"
+; VP_16:         Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0
@@ -834,6 +1218,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
+; VP_2-LABEL:  Checking a loop in "i32_factor_4"
+; VP_2:          Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp3
+; VP_4-LABEL:  Checking a loop in "i32_factor_4"
+; VP_4:          Found an estimated cost of 32 for VF 4 For recipe: "WIDEN load ir<%tmp0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp3>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0>
+; VP_4-NEXT:     Found an estimated cost of 32 for VF 4 For recipe: "WIDEN store ir<%tmp3>, ir<0>
+; VP_8-LABEL:  Checking a loop in "i32_factor_4"
+; VP_8:          Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0, %tmp3
+; VP_16-LABEL: Checking a loop in "i32_factor_4"
+; VP_16:         Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0
@@ -897,6 +1317,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 1152 for VF 16 For instruction: store i64 0, i64* %tmp3, align 8
+; VP_2-LABEL:  Checking a loop in "i64_factor_4"
+; VP_2:          Found an estimated cost of 48 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 32 for VF 2 For recipe: "REPLICATE store 0, %tmp3
+; VP_4-LABEL:  Checking a loop in "i64_factor_4"
+; VP_4:          Found an estimated cost of 160 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp2
+; VP_4-NEXT:     Found an estimated cost of 96 for VF 4 For recipe: "REPLICATE store 0, %tmp3
+; VP_8-LABEL:  Checking a loop in "i64_factor_4"
+; VP_8:          Found an estimated cost of 576 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 320 for VF 8 For recipe: "REPLICATE store 0, %tmp3
+; VP_16-LABEL: Checking a loop in "i64_factor_4"
+; VP_16:         Found an estimated cost of 2176 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1152 for VF 16 For recipe: "REPLICATE store 0, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 0
@@ -960,6 +1416,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VP_2-LABEL:  Checking a loop in "f16_factor_4"
+; VP_2:          Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp3
+; VP_4-LABEL:  Checking a loop in "f16_factor_4"
+; VP_4:          Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_4-NEXT:     Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp3
+; VP_8-LABEL:  Checking a loop in "f16_factor_4"
+; VP_8:          Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp3
+; VP_16-LABEL: Checking a loop in "f16_factor_4"
+; VP_16:         Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0
@@ -1023,6 +1515,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VP_2-LABEL:  Checking a loop in "f32_factor_4"
+; VP_2:          Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp3
+; VP_4-LABEL:  Checking a loop in "f32_factor_4"
+; VP_4:          Found an estimated cost of 32 for VF 4 For recipe: "WIDEN load ir<%tmp0>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp3>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0.000000e+00>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0.000000e+00>
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0.000000e+00>
+; VP_4-NEXT:     Found an estimated cost of 32 for VF 4 For recipe: "WIDEN store ir<%tmp3>, ir<0.000000e+00>
+; VP_8-LABEL:  Checking a loop in "f32_factor_4"
+; VP_8:          Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp3
+; VP_16-LABEL: Checking a loop in "f32_factor_4"
+; VP_16:         Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0
@@ -1086,6 +1614,42 @@
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store double 0.000000e+00, double* %tmp3, align 8
+; VP_2-LABEL:  Checking a loop in "f64_factor_4"
+; VP_2:          Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_2-NEXT:     Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_2-NEXT:     Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp3
+; VP_4-LABEL:  Checking a loop in "f64_factor_4"
+; VP_4:          Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_4-NEXT:     Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_4-NEXT:     Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp3
+; VP_8-LABEL:  Checking a loop in "f64_factor_4"
+; VP_8:          Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_8-NEXT:     Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_8-NEXT:     Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp3
+; VP_16-LABEL: Checking a loop in "f64_factor_4"
+; VP_16:         Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1
+; VP_16-NEXT:    Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2
+; VP_16-NEXT:    Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp3
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 0
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s --check-prefix=CHECK
-; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -cost-using-vplan -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST-VPLAN
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
@@ -10,6 +11,10 @@
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %l45 = and i32 %and515, 131072
 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %and515 = shl i32 %l41, 3
 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK-COST-VPLAN: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %and515 = shl %l41, 3
+; CHECK-COST-VPLAN: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %l45 = and %and515, 131072
+; CHECK-COST-VPLAN: LV: Found an estimated cost of 2 for VF 4 For recipe: "WIDEN\l""  %and515 = shl %l41, 3
+; CHECK-COST-VPLAN: LV: Found an estimated cost of 2 for VF 4 For recipe: "WIDEN\l""  %l45 = and %and515, 131072
 ; CHECK-NOT: vector.body
 
 define void @test([101 x i32] *%src, i32 %N) #0 {
Index: llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
+++ llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
@@ -1,7 +1,10 @@
 ; REQUIRES: asserts
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \
 ; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
-; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 ; Check costs for branches inside a vectorized loop around predicated
 ; blocks. Each such branch will be guarded with an extractelement from the
@@ -32,7 +35,10 @@
 for.end.loopexit:
   ret void
 
-; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   br i1 %cmp55, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   br i1 %exitcond, label %for.end.loopexit, label %for.body
+; CHECK-CM: LV: Found an estimated cost of 7 for VF 2 For instruction:   br i1 %cmp55, label %if.then, label %for.inc
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction:   br i1 %exitcond, label %for.end.loopexit, label %for.body
+; CHECK-VP: LV: Found an estimated cost of 7 for VF 2 For recipe: "BRANCH-ON-MASK ir<%cmp55>
+; CHECK-VP-NOT: LV: Found an estimated cost of {{.*}} for VF 2 For recipe: {{.*}}  br
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For loop backedge cost (br)
 }
Index: llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
+++ llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
@@ -1,6 +1,9 @@
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \
 ; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
-; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ; REQUIRES: asserts
 ;
 ; Check that a scalarized load does not get operands scalarization costs added.
@@ -22,6 +25,8 @@
 for.end:
   ret void
 
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %mul = mul nsw i64 %iv, %s
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %ld = load i64, i64* %bct
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction:   %mul = mul nsw i64 %iv, %s
+; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction:   %ld = load i64, i64* %bct
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe:   "REPLICATE %mul = mul %iv, %s
+; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe:   "REPLICATE %ld = load %bct
 }
Index: llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
+++ llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
@@ -1,7 +1,11 @@
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \
 ; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
 ; RUN:   -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \
-; RUN:   | FileCheck %s
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ; REQUIRES: asserts
 ;
 ; Check that a scalarized load does not get a zero cost in a vectorized
@@ -24,5 +28,6 @@
 for.end:
   ret i32 %acc_next
 
-; CHECK: Found an estimated cost of 4 for VF 4 For instruction:   %ld = load i32, i32* %gep
+; CHECK-CM: Found an estimated cost of 4 for VF 4 For instruction:   %ld = load i32, i32* %gep
+; CHECK-VP: Found an estimated cost of 4 for VF 4 For recipe:   "REPLICATE %ld = load %gep
 }
Index: llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
+++ llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -1,8 +1,12 @@
 ; REQUIRES: asserts
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \
 ; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
 ; RUN:   -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
-; RUN:   FileCheck %s
+; RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ;
 ; Check that a scalarized load/store does not get a cost for insterts/
 ; extracts, since z13 supports element load/store.
@@ -27,7 +31,9 @@
 ; CHECK: LV: Scalarizing:  %tmp1 = load i32, i32* %tmp0, align 4
 ; CHECK: LV: Scalarizing:  store i32 %tmp2, i32* %tmp0, align 4
 
-; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
-; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp2, i32* %tmp0, align 4
+; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp2, i32* %tmp0, align 4
+; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe:   "REPLICATE %tmp1 = load %tmp0
+; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe:   "REPLICATE store %tmp2, %tmp0
 }
 
Index: llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
+++ llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -1,7 +1,10 @@
 ; REQUIRES: asserts
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \
 ; RUN:   -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\
-; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \
+; RUN:   -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ;
 ; Check that some cost estimations for interleave groups make sense.
 
@@ -11,10 +14,11 @@
 ; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
 ;
 ; CHECK: LV: Checking a loop in "fun0"
-; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+; CHECK-CM: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+; CHECK-VP: LV: Found an estimated cost of 6 for VF 4 For recipe:  "INTERLEAVE-GROUP with factor 4 at %ld0
 define void @fun0(i16 *%ptr, i16 *%dst) {
 entry:
   br label %for.body
@@ -49,7 +53,8 @@
 ; which gives a cost of 5.
 ;
 ; CHECK: LV: Checking a loop in "fun1"
-; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+; CHECK-CM: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+; CHECK-VP: LV: Found an estimated cost of 5 for VF 16 For recipe:  "INTERLEAVE-GROUP with factor 3 at %ld0
 define void @fun1(i8 *%ptr, i8 *%dst) {
 entry:
   br label %for.body
@@ -75,10 +80,11 @@
 ; produce the vector values, which gives a cost of 6.
 ;
 ; CHECK: LV: Checking a loop in "fun2"
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK-CM: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK-VP: LV: Found an estimated cost of 6 for VF 2 For recipe:  "INTERLEAVE-GROUP with factor 32 at %ld0
 define void @fun2(i8 *%ptr, i8 *%dst) {
 entry:
   br label %for.body
@@ -115,10 +121,11 @@
 ; vector register boundary.
 ;
 ; CHECK: LV: Checking a loop in "fun3"
-; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK-CM: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK-VP: LV: Found an estimated cost of 7 for VF 2 For recipe:  "INTERLEAVE-GROUP with factor 30 at %ld0
 define void @fun3(i8 *%ptr, i8 *%dst) {
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
+++ llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
@@ -1,7 +1,10 @@
 ; REQUIRES: asserts
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \
 ; RUN:   -force-vector-width=4 -debug-only=loop-vectorize,vectorutils \
-; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize,vectorutils \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ;
 ; Check that the loop vectorizer performs memory interleaving with accurate
 ; cost estimations.
@@ -27,7 +30,8 @@
   ret void
 
 ; CHECK: LV: Creating an interleave group with:  %tmp1 = load i32, i32* %tmp0, align 4
-; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK-CM: LV: Found an estimated cost of 3 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK-VP: LV: Found an estimated cost of 3 for VF 4 For recipe:   "INTERLEAVE-GROUP with factor 2 at %tmp1
 ;        (vl; vl; vperm)
 }
 
@@ -59,12 +63,14 @@
 ; CHECK: LV: Inserted:  %tmp1 = load i32, i32* %tmp0, align 4
 ; CHECK:     into the interleave group with  %tmp3 = load i32, i32* %tmp2, align 4
 
-; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe:   "INTERLEAVE-GROUP with factor 2 at %tmp1
 ;            (vl; vl; vperm, vpkg)
 
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i32 %tmp1, i32* %tmp2, align 4
-; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp3, i32* %tmp0, align 4
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i32 %tmp1, i32* %tmp2, align 4
+; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp3, i32* %tmp0, align 4
+; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe:   "INTERLEAVE-GROUP with factor 2
 ;            (vmrlf; vmrhf; vst; vst)
 }
 
Index: llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
+++ llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
@@ -5,9 +5,9 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-; CHECK: Found an estimated cost of 4 for VF 1 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 4 for VF 2 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 4 for VF 4 For instruction:   %neg = fneg float %{{.*}}
+; CHECK: Found an estimated cost of 4 for VF 1 For {{.*}}   %neg = fneg
+; CHECK: Found an estimated cost of 4 for VF 2 For {{.*}}   %neg = fneg
+; CHECK: Found an estimated cost of 4 for VF 4 For {{.*}}   %neg = fneg
 define void @fneg_cost(float* %a, i64 %n) {
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
+++ llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
@@ -5,7 +5,7 @@
 target triple = "x86_64-apple-macosx10.8.0"
 
 
-; CHECK: cost of 4 for VF 8 For instruction:   %conv = fptosi float %tmp to i8
+; CHECK: cost of 4 for VF 8 For {{.*}}  %conv = fptosi
 define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
+++ llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -32,38 +32,38 @@
   %conv3 = sext i8 %1 to i32
 ; sources of the mul is sext\sext from i8 
 ; use pmullw\sext seq.   
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32  
+; SLM:  cost of 3 for VF 4 {{.*}} mul  
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i8
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %conv4 = zext i8 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i8
 ; use pmullw\zext
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 3 for VF 4 {{.*}} mul
   %conv5 = zext i8 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-120
 ; use pmullw\sext
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 3 for VF 4 {{.*}} mul
   %mul4 = mul nsw i32 -120, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\250
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %mul5 = mul nsw i32 250, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-120
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %mul6 = mul nsw i32 -120, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\250
 ; use pmullw\zext
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 3 for VF 4 {{.*}} mul
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
@@ -101,38 +101,38 @@
   %conv3 = sext i16 %1 to i32
 ; sources of the mul is sext\sext from i16 
 ; use pmulhw\pmullw\pshuf seq.   
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32  
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i16
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 11 for VF 4 {{.*}} mul
   %conv4 = zext i16 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i16
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %conv5 = zext i16 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-32000
 ; use pmulhw\pmullw\sext
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %mul4 = mul nsw i32 -32000, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\64000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 11 for VF 4 {{.*}} mul
   %mul5 = mul nsw i32 64000, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-32000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 11 for VF 4 {{.*}} mul
   %mul6 = mul nsw i32 -32000, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\64000
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32
+; SLM:  cost of 5 for VF 4 {{.*}} mul
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
Index: llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce -instcombine -debug-only=loop-vectorize -S < %s 2>&1  | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce -instcombine -cost-using-vplan=false -debug-only=loop-vectorize -S < %s 2>&1  | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce -instcombine -cost-using-vplan=true -debug-only=loop-vectorize -S < %s 2>&1  | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -13,36 +14,60 @@
 ;
 
 ; CHECK-LABEL: reduction_i8
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = load
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = load
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = trunc
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = icmp
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   br
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = trunc
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = icmp
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = phi
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = phi
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = getelementptr
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = load
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = getelementptr
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = load
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = add
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = trunc
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = icmp
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   br
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
+; CHECK-CM-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
+; CHECK-CM-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK-CM-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = trunc
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = icmp
+; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "WIDEN-INDUCTION  %{{.*}} = phi
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "WIDEN-PHI %{{.*}} = phi
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = getelementptr
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = load
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = zext
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = getelementptr
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = load
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = zext
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = and
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = add
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe:  "CLONE %{{.*}} = add
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For loop induction check (add + icmp)
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN-INDUCTION %{{.*}} = phi
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN-PHI %{{.*}} = phi
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "CLONE %{{.*}} = getelementptr
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN load
+; CHECK-VP-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN\l"" %{{.*}} = zext
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "CLONE %{{.*}} = getelementptr
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN load
+; CHECK-VP-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN\l"" %{{.*}} = zext
+; CHECK-VP-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN\l"" %{{.*}} = and
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN\l"" %{{.*}} = add
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe:  "WIDEN\l"" %{{.*}} = add
+; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For loop induction check (add + icmp)
 ;
 define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
+++ llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -mtriple x86_64 -debug -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple x86_64 -cost-using-vplan=false -debug -disable-output 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Check that cost model is not executed twice for VF=2 when vectorization is
Index: llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -1,13 +1,17 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -cost-using-vplan=false -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -cost-using-vplan=true -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
 
-; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 6 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK-CM: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK-CM: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK-CM: cost of 6 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK-VP: cost of 4 for VF 1 For recipe: "CLONE %conv = uitofp %tmp
+; CHECK-VP: cost of 5 for VF 2 For recipe: "WIDEN\l"" %conv = uitofp %tmp
+; CHECK-VP: cost of 6 for VF 4 For recipe: "WIDEN\l"" %conv = uitofp %tmp
 define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
+++ llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -1,8 +1,10 @@
-; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -cost-using-vplan=false -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -cost-using-vplan=true -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 ; REQUIRES: asserts
 
 ; CHECK: "foo"
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %shift = ashr i32 %val, %k
+; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction:   %shift = ashr i32 %val, %k
+; CHECK-VP: LV: Found an estimated cost of 1 for VF 4 For recipe:   "WIDEN\l"" %shift = ashr %val, %k
 define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 {
 entry:  
   br label %body
Index: llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -23,7 +23,7 @@
   %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
 
 ; A scalar select has a cost of 1 on core2
-; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
+; CHECK: cost of 1 for VF 2 {{.*}}  select
 
   %sel = select i1 %cond, i32 %6, i32 zeroinitializer
   store i32 %sel, i32* %7, align 4
@@ -51,7 +51,7 @@
   %8 = icmp ult i64 %indvars.iv, 8
 
 ; A vector select has a cost of 1 on core2
-; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
+; CHECK: cost of 1 for VF 2 {{.*}}  select
 
   %sel = select i1 %8, i32 %6, i32 zeroinitializer
   store i32 %sel, i32* %7, align 4
Index: llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
+++ llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@@ -12,6 +12,7 @@
 ; uniform after vectorization.
 ;
 ; CHECK:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+; CHECK-LABEL: @consecutive_ptr_forward(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK-NOT:   getelementptr
@@ -44,6 +45,7 @@
 ; uniform after vectorization.
 ;
 ; CHECK:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+; CHECK-LABEL: @consecutive_ptr_reverse(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:       %offset.idx = sub i64 %n, %index
@@ -82,6 +84,7 @@
 ;
 ; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
 ; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; CHECK-LABEL: @interleaved_access_forward(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:       %[[I1:.+]] = or i64 %index, 1
@@ -99,6 +102,7 @@
 ;
 ; INTER:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
 ; INTER:     LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; INTER-LABEL: @interleaved_access_forward(
 ; INTER:     vector.body
 ; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; INTER-NOT:   getelementptr
@@ -139,6 +143,7 @@
 ; recognized as uniform, and it should not be uniform after vectorization.
 ; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
 ; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; CHECK-LABEL: @interleaved_access_reverse(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:       %offset.idx = sub i64 %n, %index
@@ -157,6 +162,7 @@
 ;
 ; INTER:     LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
 ; INTER:     LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; INTER-LABEL: @interleaved_access_reverse(
 ; INTER:     vector.body
 ; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; INTER:       %offset.idx = sub i64 %n, %index
@@ -198,6 +204,7 @@
 ; non-uniform.
 ;
 ; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; CHECK-LABEL: @predicated_store(
 ; INTER:     vector.body
 ; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ]
 ; INTER:       %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
@@ -242,6 +249,7 @@
 ; because the stored type may required padding.
 ;
 ; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
+; CHECK-LABEL: @irregular_type(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:       %[[I1:.+]] = or i64 %index, 1
@@ -276,6 +284,7 @@
 ; uniform after vectorization.
 ;
 ; CHECK:     LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; CHECK-LABEL: @pointer_iv_uniform(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK-NOT:   getelementptr
@@ -308,6 +317,7 @@
 ; due to scalarization of the stores.
 ;
 ; INTER-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; CHECK-LABEL: @pointer_iv_non_uniform_0(
 ; INTER:     vector.body
 ; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; INTER:       %[[I0:.+]] = shl i64 %index, 2
@@ -358,6 +368,7 @@
 ; induction variable is used by a store that will be scalarized.
 ;
 ; CHECK-NOT: LV: Found uniform instruction: %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry]
+; CHECK-LABEL: @pointer_iv_non_uniform_1(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:       %next.gep = getelementptr x86_fp80, x86_fp80* %a, i64 %index
@@ -396,6 +407,7 @@
 ;
 ; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
 ; CHECK:     LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+; CHECK-LABEL: @pointer_iv_mixed(
 ; CHECK:     vector.body
 ; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:       %next.gep = getelementptr i32, i32* %a, i64 %index
@@ -453,6 +465,7 @@
 ; INTER-NEXT:  LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
 ; INTER-NEXT:  LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 ; INTER-NEXT:  LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1
+; INTER-LABEL: @bitcast_pointer_operand(
 ; INTER:       vector.body:
 ; INTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; INTER-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]]
Index: llvm/test/Transforms/LoopVectorize/loop-scalars.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/loop-scalars.ll
+++ llvm/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -5,6 +5,7 @@
 
 ; CHECK-LABEL: vector_gep
 ; CHECK-NOT:   LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK-LABEL: @vector_gep(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
@@ -38,6 +39,7 @@
 ; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
 ; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK-LABEL: @scalar_store(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
@@ -75,6 +77,7 @@
 ; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
 ; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK-LABEL: @expansion(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
@@ -113,6 +116,7 @@
 ; CHECK-NOT:   LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8
 ; CHECK:       LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1
+; CHECK-LABEL: @no_gep_or_bitcast(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
Index: llvm/test/Transforms/LoopVectorize/phi-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/phi-cost.ll
+++ llvm/test/Transforms/LoopVectorize/phi-cost.ll
@@ -1,11 +1,15 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -cost-using-vplan=false -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -cost-using-vplan=true -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: phi_two_incoming_values
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ]
+; CHECK-CM:    LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+; CHECK-CM:    LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ]
+; CHECK-VP:    LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN-INDUCTION %i = phi %i.next, 0
+; CHECK-VP:    LV: Found an estimated cost of 1 for VF 2 For recipe: "BLEND %tmp5 = ir<%tmp1>/vp<%0> ir<%tmp4>/ir<%tmp3>
+; CHECK-LABEL: @phi_two_incoming_values(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK:         [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{.*}}
@@ -43,8 +47,11 @@
 }
 
 ; CHECK-LABEL: phi_three_incoming_values
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
-; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ]
+; CHECK-CM:    LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+; CHECK-CM:    LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ]
+; CHECK-VP:    LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN-INDUCTION %i = phi %i.next, 0
+; CHECK-VP:    LV: Found an estimated cost of 2 for VF 2 For recipe: "BLEND %tmp8 = ir<9>/vp<%0> ir<3>/vp<%1> ir<%tmp7>/vp<%3>
+; CHECK-LABEL: @phi_three_incoming_values(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK:         [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 9, i32 9>