diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -35,6 +35,8 @@ class LoopVectorizationCostModel; class PredicatedScalarEvolution; class VPRecipeBuilder; +class IMLAEMainLoop; +class IMLAEEpilogueLoop; /// VPlan-based builder utility analogous to IRBuilder. class VPBuilder { @@ -260,6 +262,20 @@ /// best selected VPlan. void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); + /// The following overloads are used to generate code for the vectorization of + /// the main loop and the epilogue, respectively. These functions use the + /// provided specialized inner loop vectorizer arguments to generate the + /// skeleton around the vectorized loop. They are meant to be called in two + /// stages. The first function sets up the skeleton and executes the selected + /// vplan for the main loop, while the second one completes the skeleton and + /// executes the vplan for the epilogue loop. + void executePlan(IMLAEMainLoop &LB, DominatorTree *DT) { + executePlanForEpilogueVectorization(LB, DT); + } + void executePlan(IMLAEEpilogueLoop &LB, DominatorTree *DT) { + executePlanForEpilogueVectorization(LB, DT); + } + void printPlans(raw_ostream &O) { for (const auto &Plan : VPlans) O << *Plan; @@ -307,6 +323,9 @@ /// reduction chain. void adjustRecipesForInLoopReductions(VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder); + + template + void executePlanForEpilogueVectorization(ILVType &LB, DominatorTree *DT); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -156,6 +156,7 @@ #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME +static const char *VerboseDebug = DEBUG_TYPE "-verbose"; /// @{ /// Metadata attribute names @@ -169,6 +170,17 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); +STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); + +static cl::opt EnableEpilogueVectorization( + "enable-epilogue-vectorization", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization of residual loops.")); + +static cl::opt EpilogueVectorizationForceVF( + "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, + cl::desc("When epilogue vectorization is enabled, and a value greater than " + "1 is specified, forces the given VF for all applicable epilogue " + "loops.")); /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. @@ -885,6 +897,137 @@ Value *reverseVector(Value *Vec) override; }; +/// Encapsulate information regarding vectorization of a loop and its epilogue. +/// This information is meant to be updated and used across two stages of +/// epilogue vectorization. 
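+/// The main and epilogue VF/UF are fixed when the info object is created; the
+/// first pass additionally records the runtime-check blocks and the (vector)
+/// trip count of the main loop so that the second pass can reuse them when
+/// wiring in the vectorized epilogue.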
+struct EpilogueLoopVectorizationInfo { + ElementCount MainLoopVF = ElementCount::getFixed(0); + unsigned MainLoopUF = 0; + ElementCount EpilogueVF = ElementCount::getFixed(0); + unsigned EpilogueUF = 0; + BasicBlock *MainLoopIterationCountCheck = nullptr; + BasicBlock *EpilogueIterationCountCheck = nullptr; + BasicBlock *SCEVSafetyCheck = nullptr; + BasicBlock *MemSafetyCheck = nullptr; + Value *TripCount = nullptr; + Value *VectorTripCount = nullptr; + + EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, + unsigned EUF) + : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), + EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {} +}; + +/// An extension of the inner loop vectorizer that creates a skeleton for a +/// vectorized loop that has its epilogue (residual) also vectorized. Note: +/// The idea is to run the vplan on a given loop twice, firstly to vectorize +/// the main loop and secondly to vectorize the epilogue. This is achieved by +/// deriving two concrete strategy classes from this base class and invoking +/// them in succession from the loop vectorizer planner. +class InnerMainLoopAndEpilogueVectorizer : public InnerLoopVectorizer { +public: + InnerMainLoopAndEpilogueVectorizer( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI(EPI) {} + + /// The interface for creating a vectorized skeleton using one of two + /// different strategies, each corresponding to one execution of the vplan + /// as described above. + virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + + /// Holds and updates state information required to vectorize the main loop + /// and its epilogue in two separate passes. This setup helps us avoid + /// regenerating and recomputing runtime safety checks. It also helps us to + /// shorten the iteration-count-check path length for the cases where the + /// iteration count of the loop is so small that the main vector loop is + /// completely skipped. + EpilogueLoopVectorizationInfo &EPI; +}; + +/// A specialized derived class of inner loop vectorizer that performs +/// vectorization of *main* loops in the process of vectorizing loops and their +/// epilogues. +class IMLAEMainLoop : public InnerMainLoopAndEpilogueVectorizer { +public: + IMLAEMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) + : InnerMainLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *main loop* strategy (ie the first pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + + /// Prints debug traces before vplan execution, when trace information is + /// requested. 
+ void printDebugTracesAtStart(); + + /// Prints debug traces after vplan execution, when trace information is + /// requested. + void printDebugTracesAtEnd(); + +protected: + /// Emits an iteration count bypass check once for the main loop (when \p + /// ForEpilogue is false) and once for the epilogue loop (when \p + /// ForEpilogue is true). + BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, + bool ForEpilogue); +}; + +// A specialized derived class of inner loop vectorizer that performs +// vectorization of *epilogue* loops in the process of vectorizing loops and +// their epilogues. +class IMLAEEpilogueLoop : public InnerMainLoopAndEpilogueVectorizer { +public: + IMLAEEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, + llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + : InnerMainLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, LVL, CM, BFI, PSI) {} + /// Implements the interface for creating a vectorized skeleton using the + /// *epilogue loop* strategy (ie the second pass of vplan execution). + BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + + /// Prints debug traces before vplan execution, when trace information is + /// requested. + void printDebugTracesAtStart(); + + /// Prints debug traces after vplan execution, when trace information is + /// requested. + void printDebugTracesAtEnd(); + +protected: + /// Emits an iteration count bypass check after the main vector loop has + /// finished to see if there are any iterations left to execute by either + /// the vector epilogue or the scalar epilogue. + BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, + BasicBlock *Bypass, + BasicBlock *Insert); + + /// This function is very similar to and only differs from + /// `InnerLoopVectorizer::createInductionResumeValues` in the way it handles + /// the bypass values for the resume values. + void createInductionResumeValues(Loop *L, Value *VectorTripCount, + BasicBlock *VecEpilogIterCheck); +}; } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -1076,6 +1219,7 @@ /// then this vectorization factor will be selected if vectorization is /// possible. VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + VectorizationFactor selectEpilogueVectorizationFactor(unsigned MaxVF); /// Setup cost-based decisions for user vectorization factor. void selectUserVectorizationFactor(ElementCount UserVF) { @@ -1608,6 +1752,11 @@ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -1650,6 +1799,9 @@ /// Values to ignore in the cost model when VF > 1. SmallPtrSet VecValuesToIgnore; + + /// Profitable vector factors. + SmallVector ProfitableVFs; }; } // end namespace llvm @@ -5427,6 +5579,13 @@ << " because it will not generate any vector instructions.\n"); continue; } + + // If profitable add it to ProfitableVF list. 
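+    // These candidates are consumed later by selectEpilogueVectorizationFactor()
+    // when choosing a vectorization factor for the epilogue loop.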
+ if (VectorCost < ScalarCost) { + ProfitableVFs.push_back(VectorizationFactor( + {ElementCount::getFixed(i), (unsigned)VectorCost})); + } + if (VectorCost < Cost) { Cost = VectorCost; Width = i; @@ -5450,6 +5609,80 @@ return Factor; } +bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( + const Loop &L, ElementCount VF) const { + // Cross iteration phis such as reductions need special handling and are + // currently unsupported. + for (PHINode &Phi : L.getHeader()->phis()) + if (Legal->isFirstOrderRecurrence(&Phi) || Legal->isReductionVariable(&Phi)) + return false; + + // Phis with uses outside of the loop require special handling and are + // currently unsupported. + for (auto &Entry : Legal->getInductionVars()) { + // Look for uses of the value of the induction at the last iteration. + Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + for (User *U : PostInc->users()) + if (!L.contains(cast(U))) + return false; + // Look for uses of penultimate value of the induction. + for (User *U : Entry.first->users()) + if (!L.contains(cast(U))) + return false; + } + + // Induction variables that are widened require special handling that is + // currently not supported. + for (auto &Entry : Legal->getInductionVars()) + if (!(isScalarAfterVectorization(Entry.first, VF) || + isProfitableToScalarize(Entry.first, VF))) + return false; + + return true; +} + +VectorizationFactor +LoopVectorizationCostModel::selectEpilogueVectorizationFactor(unsigned MainLoopVF) { + // FIXME: This is a very crude heuristic. We need a much better cost-model. + VectorizationFactor Result = VectorizationFactor::Disabled(); + if (!EnableEpilogueVectorization) { + LLVM_DEBUG({ dbgs() << "LEV: Epilogue vectorization is disabled.\n"; }); + return Result; + } + // Not really a cost consideration, but check for unsupported cases here to + // simplify the logic. + if (!isCandidateForEpilogueVectorization( + *TheLoop, ElementCount::getFixed(MainLoopVF))) { + LLVM_DEBUG({ + dbgs() << "LEV: Unable to vectorize epilogue because the loop is " + "not a supported candidate.\n"; + }); + return Result; + } + + if (EpilogueVectorizationForceVF > 1) { + LLVM_DEBUG( + { dbgs() << "LEV: Epilogue vectorization factor is forced.\n"; }); + return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; + } + // Original loop vector factor should be at least 16. + if (MainLoopVF < 16) { + return Result; + } + for (auto &NextVF : ProfitableVFs) { + if ((NextVF.Width.getFixedValue() < MainLoopVF) && + (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost)) { + Result = NextVF; + } + } + if (!(Result == VectorizationFactor::Disabled())) + LLVM_DEBUG({ + dbgs() << "LEV: Vectorizing epilogue loop with VF = " + << Result.Width.getFixedValue() << "\n"; + }); + return Result; +} + std::pair LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; @@ -7083,6 +7316,37 @@ ILV.fixVectorizedLoop(); } +template +void LoopVectorizationPlanner::executePlanForEpilogueVectorization( + ILVType &ILV, DominatorTree *DT) { + // Perform the actual loop transformation. + + // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
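+  // Note: the concrete skeleton (main loop vs. epilogue) is produced by the
+  // ILV subclass via createEpilogueVectorizedLoopSkeleton() below.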
+ VPCallbackILV CallbackILV(ILV); + + assert(BestVF.hasValue() && "Vectorization Factor is missing"); + + VPTransformState State{*BestVF, BestUF, LI, + DT, ILV.Builder, ILV.VectorLoopValueMap, + &ILV, CallbackILV}; + + ILV.printDebugTracesAtStart(); + + State.CFG.PrevBB = ILV.createEpilogueVectorizedLoopSkeleton(); + State.TripCount = ILV.getOrCreateTripCount(nullptr); + State.CanonicalIV = ILV.Induction; + + // 2. Copy and widen instructions from the old loop into the new loop. + assert(VPlans.size() == 1 && "Not a single VPlan to execute."); + VPlans.front()->execute(&State); + + // 3. Fix the vectorized code: take care of header phi's, live-outs, + // predication, updating analyses. + ILV.fixVectorizedLoop(); + + ILV.printDebugTracesAtEnd(); +} + void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions) { BasicBlock *Latch = OrigLoop->getLoopLatch(); @@ -7181,6 +7445,332 @@ } } +//===--------------------------------------------------------------------===// +// IMLAEMainLoop +//===--------------------------------------------------------------------===// + +BasicBlock *IMLAEMainLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton(""); + + // Generate the code to check the minimum iteration count of the vector + // epilogue (see below). + EPI.EpilogueIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); + EPI.EpilogueIterationCountCheck->setName("iter.check"); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + BasicBlock *SavedPreHeader = LoopVectorPreHeader; + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.SCEVSafetyCheck = SavedPreHeader; + + // Generate the code that checks at runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + SavedPreHeader = LoopVectorPreHeader; + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // If a safety check was generated save/overwite it. + if (SavedPreHeader != LoopVectorPreHeader) + EPI.MemSafetyCheck = SavedPreHeader; + + // Generate the iteration count check for the main loop, *after* the check + // for the epilogue loop, so that the path-length is shorter for the case + // that goes directly through the vector epilogue. The longer-path length for + // the main loop is compensated for, by the gain from vectorizing the larger + // trip count. Note: the branch will get updated later on when we vectorize + // the epilogue. + EPI.MainLoopIterationCountCheck = + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + EPI.VectorTripCount = CountRoundDown; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Skip induction resume value creation here because they will be created in + // the second pass. If we created them here, they wouldn't be used anyway, + // because the vplan in the second pass still contains the inductions from the + // original loop. 
+ + return completeLoopSkeleton(Lp, OrigLoopID); +} + +void IMLAEMainLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void IMLAEMainLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + }); +} + +BasicBlock *IMLAEMainLoop::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass, + bool ForEpilogue) { + assert(L && "Expected valid Loop."); + assert(Bypass && "Expected valid bypass basic block."); + unsigned VFactor = + ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); + unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; + Value *Count = getOrCreateTripCount(L); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); + + // Generate code to check if the loop's trip count is less than VF * UF of the + // main vector loop. + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), + "min.iters.check"); + + if (!ForEpilogue) + TCCheckBlock->setName("vector.main.loop.iter.check"); + + // Create new preheader for vector loop. + LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + DT, LI, nullptr, "vector.ph"); + + if (ForEpilogue) { + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + LoopBypassBlocks.push_back(TCCheckBlock); + + // Save the trip count so we don't have to regenerate it in the + // vec.epilog.iter.check. This is safe to do because the trip count + // generated here dominates the vector epilog iter check. + EPI.TripCount = Count; + } + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + return TCCheckBlock; +} + +//===--------------------------------------------------------------------===// +// IMLAEEpilogueLoop +//===--------------------------------------------------------------------===// + +BasicBlock *IMLAEEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { + MDNode *OrigLoopID = OrigLoop->getLoopID(); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); + + // Now, compare the remaining count and if there aren't enough iterations to + // execute the vectorized epilogue skip to the scalar part. + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); + emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, + VecEpilogueIterationCountCheck); + + // Adjust the control flow taking the state info from the main loop + // vectorization into account. 
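+  // After this, the main loop's minimum-iteration check branches directly to
+  // the vector epilogue's preheader when the main vector loop has to be
+  // skipped, while the first-pass iteration count check and the SCEV/memory
+  // safety checks bypass straight to the scalar preheader.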
+ assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopVectorPreHeader); + + DT->changeImmediateDominator(LoopVectorPreHeader, + EPI.MainLoopIterationCountCheck); + + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + if (EPI.SCEVSafetyCheck) + EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + if (EPI.MemSafetyCheck) + EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, LoopScalarPreHeader); + + DT->changeImmediateDominator( + VecEpilogueIterationCountCheck, + VecEpilogueIterationCountCheck->getSinglePredecessor()); + + DT->changeImmediateDominator(LoopScalarPreHeader, + EPI.EpilogueIterationCountCheck); + DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); + + // Keep track of bypass blocks, as they feed start values to the induction + // phis in the scalar loop preheader. + if (EPI.SCEVSafetyCheck) + LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + LoopBypassBlocks.push_back(EPI.MemSafetyCheck); + LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + + // Generate a resume induction for the vector epilogue and put it in the + // vector epilogue preheader + Type *IdxTy = Legal->getWidestInductionType(); + PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", + LoopVectorPreHeader->getFirstNonPHI()); + EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); + EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), + EPI.MainLoopIterationCountCheck); + + // Generate the induction variable. + OldInduction = Legal->getPrimaryInduction(); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Value *StartIdx = EPResumeVal; + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Generate induction resume values. These variables save the new starting + // indexes for the scalar loop. They are used to test if there are any tail + // iterations left once the vector loop has completed. + createInductionResumeValues(Lp, CountRoundDown, + VecEpilogueIterationCountCheck); + + AddRuntimeUnrollDisableMetaData(Lp); + return completeLoopSkeleton(Lp, OrigLoopID); +} + +void IMLAEEpilogueLoop::createInductionResumeValues( + Loop *L, Value *VectorTripCount, BasicBlock *VecEpilogIterCheck) { + assert(VectorTripCount && L && VecEpilogIterCheck && + "Expected valid arguments"); + // We are going to resume the execution of the scalar loop. + // Go over all of the induction variables that we found and fix the + // PHIs that are left in the scalar version of the loop. + // The starting values of PHI nodes depend on the counter of the last + // iteration in the vectorized loop and/or the vectorized epilogue loop. + // If we come from a bypass edge then we need to start from the original + // start value. + for (auto &InductionEntry : Legal->getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + InductionDescriptor II = InductionEntry.second; + + // Create phi nodes to merge from the backedge-taken check block. 
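+    // The resume phi merges the end value computed after the vector epilogue
+    // (via the middle block), the original start value (via the bypass
+    // blocks), and the main loop's end value for the case where the vector
+    // epilogue is skipped (via the vec.epilog.iter.check block).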
+    PHINode *BCResumeVal =
+        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+                        LoopScalarPreHeader->getTerminator());
+    // Copy original phi DL over to the new one.
+    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+    Value *&EndValue = IVEndValues[OrigPhi];
+    // When the vectorized epilogue is skipped due to iteration count
+    // check, then the resume value for the induction variable comes from
+    // the trip count of the main vector loop.
+    Value *EndValueBypassingVectorEpilogue = EPI.VectorTripCount;
+    if (OrigPhi == OldInduction) {
+      // We know what the end value is.
+      EndValue = VectorTripCount;
+    } else {
+      IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+      Type *StepType = II.getStep()->getType();
+      Instruction::CastOps CastOp =
+          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
+      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
+      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
+      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+      EndValue->setName("ind.end");
+
+      // Compute the end value for the bypass from vector iter count check.
+      B.SetInsertPoint(&(*VecEpilogIterCheck->getFirstInsertionPt()));
+      CastOp =
+          CastInst::getCastOpcode(EPI.VectorTripCount, true, StepType, true);
+      CRD = B.CreateCast(CastOp, EPI.VectorTripCount, StepType, "cast.crd");
+      EndValueBypassingVectorEpilogue =
+          emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+      EndValueBypassingVectorEpilogue->setName("ind.end");
+    }
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+
+    // Fix the scalar body counter (PHI node).
+    // The old induction's phi node in the scalar body needs the truncated
+    // value.
+    for (BasicBlock *BB : LoopBypassBlocks)
+      BCResumeVal->addIncoming(II.getStartValue(), BB);
+
+    BCResumeVal->setIncomingValueForBlock(VecEpilogIterCheck,
+                                          EndValueBypassingVectorEpilogue);
+
+    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+  }
+}
+
+BasicBlock *IMLAEEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
+    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
+
+  assert(EPI.TripCount &&
+         "Expected trip count to have been saved in the first pass.");
+  assert(!isa<Instruction>(EPI.TripCount) ||
+         DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert) &&
+             "saved trip count does not dominate insertion point.");
+  Value *TC = EPI.TripCount;
+  IRBuilder<> Builder(Insert->getTerminator());
+  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
+
+  // Generate code to check if the loop's trip count is less than VF * UF of the
+  // vector epilogue loop.
+  auto P =
+      Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + + Value *CheckMinIters = Builder.CreateICmp( + P, Count, + ConstantInt::get(Count->getType(), + EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), + "min.epilog.iters.check"); + + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + + LoopBypassBlocks.push_back(Insert); + return Insert; +} + +void IMLAEEpilogueLoop::printDebugTracesAtStart() { + LLVM_DEBUG({ + dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" + << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() + << ", Main Loop UF:" << EPI.MainLoopUF + << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() + << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + }); +} + +void IMLAEEpilogueLoop::printDebugTracesAtEnd() { + DEBUG_WITH_TYPE(VerboseDebug, { + dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + }); +} + bool LoopVectorizationPlanner::getDecisionAndClampRange( const std::function &Predicate, VFRange &Range) { assert(Range.End > Range.Start && "Trying to test an empty VF range."); @@ -8553,16 +9143,51 @@ }); } else { // If we decided that it is *legal* to vectorize the loop, then do it. - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width.getFixedValue()); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), 1); + IMLAEMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, + BFI, PSI); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; + + simplifyLoop(L, DT, LI, SE, AC, nullptr, false); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + IMLAEEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, + &CM, BFI, PSI); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there are + // no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } // Report the vectorization decision. 
ORE->emit([&]() { diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; Function Attrs: nounwind +define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias %cc, i32 signext %N) #0 { +; CHECK-LABEL: @f1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AA1:%.*]] = bitcast float* [[AA:%.*]] to i8* +; CHECK-NEXT: [[BB3:%.*]] = bitcast float* [[BB:%.*]] to i8* +; CHECK-NEXT: [[CC6:%.*]] = bitcast float* [[CC:%.*]] to i8* +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[AA]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[BB]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8* +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[CC]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP45]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[BB3]], [[SCEVGEP2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[AA1]], [[SCEVGEP78]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[CC6]], [[SCEVGEP2]] +; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK12:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 48 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK12]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 48 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 20 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 24 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 28 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 
[[INDEX]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 36 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 40 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 44 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, <4 x float>* [[TMP27]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP29]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP31]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 16 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, <4 x float>* [[TMP33]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 20 +; CHECK-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, <4 x float>* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 24 +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, <4 x float>* [[TMP37]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 28 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, <4 x float>* [[TMP39]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 32 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, <4 x float>* [[TMP41]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 36 +; CHECK-NEXT: 
[[TMP43:%.*]] = bitcast float* [[TMP42]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, <4 x float>* [[TMP43]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 40 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, <4 x float>* [[TMP45]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 44 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, <4 x float>* [[TMP47]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP63:%.*]] = bitcast float* [[TMP62]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <4 x float>, <4 x float>* [[TMP63]], align 4 +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 8 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 12 +; CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4 +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 16 +; CHECK-NEXT: [[TMP69:%.*]] = bitcast float* [[TMP68]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <4 x float>, <4 x float>* [[TMP69]], align 4 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 20 +; CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 24 +; CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 28 +; CHECK-NEXT: [[TMP75:%.*]] = bitcast float* 
[[TMP74]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP75]], align 4 +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 32 +; CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 36 +; CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4 +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 40 +; CHECK-NEXT: [[TMP81:%.*]] = bitcast float* [[TMP80]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, <4 x float>* [[TMP81]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 44 +; CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD24]] +; CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD13]], [[WIDE_LOAD25]] +; CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[WIDE_LOAD14]], [[WIDE_LOAD26]] +; CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[WIDE_LOAD15]], [[WIDE_LOAD27]] +; CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[WIDE_LOAD16]], [[WIDE_LOAD28]] +; CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[WIDE_LOAD17]], [[WIDE_LOAD29]] +; CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[WIDE_LOAD18]], [[WIDE_LOAD30]] +; CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[WIDE_LOAD19]], [[WIDE_LOAD31]] +; CHECK-NEXT: [[TMP92:%.*]] = fadd fast <4 x float> [[WIDE_LOAD20]], [[WIDE_LOAD32]] +; CHECK-NEXT: [[TMP93:%.*]] = fadd fast <4 x float> [[WIDE_LOAD21]], [[WIDE_LOAD33]] +; CHECK-NEXT: [[TMP94:%.*]] = fadd fast <4 x float> [[WIDE_LOAD22]], [[WIDE_LOAD34]] +; CHECK-NEXT: [[TMP95:%.*]] = fadd fast <4 x float> [[WIDE_LOAD23]], [[WIDE_LOAD35]] +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP109]], align 4 +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 4 +; CHECK-NEXT: [[TMP111:%.*]] 
= bitcast float* [[TMP110]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP111]], align 4 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 8 +; CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP113]], align 4 +; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 12 +; CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP115]], align 4 +; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 16 +; CHECK-NEXT: [[TMP117:%.*]] = bitcast float* [[TMP116]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP117]], align 4 +; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 20 +; CHECK-NEXT: [[TMP119:%.*]] = bitcast float* [[TMP118]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP119]], align 4 +; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 24 +; CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP121]], align 4 +; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 28 +; CHECK-NEXT: [[TMP123:%.*]] = bitcast float* [[TMP122]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP123]], align 4 +; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 32 +; CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP92]], <4 x float>* [[TMP125]], align 4 +; CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 36 +; CHECK-NEXT: [[TMP127:%.*]] = bitcast float* [[TMP126]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP93]], <4 x float>* [[TMP127]], align 4 +; CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 40 +; CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP94]], <4 x float>* [[TMP129]], align 4 +; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, float* [[TMP96]], i32 44 +; CHECK-NEXT: [[TMP131:%.*]] = bitcast float* [[TMP130]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP95]], <4 x float>* [[TMP131]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 48 +; CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOPID1:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF36:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 +; CHECK-NEXT: [[N_VEC37:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF36]] +; CHECK-NEXT: br label 
[[VEC_EPILOG_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT39:%.*]], [[VEC_EPILOG_BODY]] ] +; CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX38]], 0 +; CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]] +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0 +; CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>* +; CHECK-NEXT: [[WIDE_LOAD41:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4 +; CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]] +; CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0 +; CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>* +; CHECK-NEXT: [[WIDE_LOAD42:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4 +; CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD41]], [[WIDE_LOAD42]] +; CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]] +; CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0 +; CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4 +; CHECK-NEXT: [[INDEX_NEXT39]] = add i64 [[INDEX38]], 2 +; CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC37]] +; CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_BODY]], [[LOOPID2:!llvm.loop !.*]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N40:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC37]] +; CHECK-NEXT: br i1 [[CMP_N40]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC37]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP145:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP146:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP145]], [[TMP146]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOPID3:!llvm.loop !.*]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, 
%for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %bb, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %cc, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %0, %1 + %arrayidx4 = getelementptr inbounds float, float* %aa, i64 %indvars.iv + store float %add, float* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="true" "use-soft-float"="false" }
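; A minimal sketch (hypothetical @skeleton_sketch, not part of the test or the
; patch) of the branch structure the two vectorization passes produce for @f1
; above: every runtime check is collapsed into an opaque condition %c and the
; loop-exit blocks are merged into a single for.end block.
define void @skeleton_sketch(i1 %c) {
iter.check:                          ; %c: trip count < epilogue VF * UF
  br i1 %c, label %scalar.ph, label %vector.memcheck
vector.memcheck:                     ; %c: runtime pointer aliasing detected
  br i1 %c, label %scalar.ph, label %vector.main.loop.iter.check
vector.main.loop.iter.check:         ; %c: trip count < main VF * UF
  br i1 %c, label %vec.epilog.ph, label %vector.ph
vector.ph:
  br label %vector.body
vector.body:                         ; main vector loop (VF=4, interleaved 12x in this test)
  br i1 %c, label %middle.block, label %vector.body
middle.block:                        ; %c: no iterations remain
  br i1 %c, label %for.end, label %vec.epilog.iter.check
vec.epilog.iter.check:               ; %c: remaining iterations < epilogue VF * UF
  br i1 %c, label %scalar.ph, label %vec.epilog.ph
vec.epilog.ph:
  br label %vec.epilog.vector.body
vec.epilog.vector.body:              ; vectorized epilogue (VF=2, forced by the RUN line)
  br i1 %c, label %vec.epilog.middle.block, label %vec.epilog.vector.body
vec.epilog.middle.block:             ; %c: no iterations remain
  br i1 %c, label %for.end, label %scalar.ph
scalar.ph:
  br label %for.body
for.body:                            ; scalar remainder loop
  br i1 %c, label %for.body, label %for.end
for.end:
  ret void
}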