Index: include/llvm/Transforms/Utils/LoopVersioning.h
===================================================================
--- include/llvm/Transforms/Utils/LoopVersioning.h
+++ include/llvm/Transforms/Utils/LoopVersioning.h
@@ -41,7 +41,7 @@
   /// object having no checks and we expect the user to add them.
   LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
                  DominatorTree *DT, ScalarEvolution *SE,
-                 bool UseLAIChecks = true);
+                 bool UseLAIChecks = true, bool AssumeLoopSimplifyForm = false);
 
   /// \brief Performs the CFG manipulation part of versioning the loop including
   /// the DominatorTree and LoopInfo updates.
@@ -146,6 +146,9 @@
   LoopInfo *LI;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  // Flag to allow non simplify form loops in LoopVersioning.
+  // Minimal expectation is Loop should have a pre header & single exit block.
+  bool AssumeLoopSimplifyForm;
 };
 }
 
Index: lib/Transforms/Utils/LoopVersioning.cpp
===================================================================
--- lib/Transforms/Utils/LoopVersioning.cpp
+++ lib/Transforms/Utils/LoopVersioning.cpp
@@ -32,11 +32,14 @@
 
 LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
                                DominatorTree *DT, ScalarEvolution *SE,
-                               bool UseLAIChecks)
+                               bool UseLAIChecks, bool AssumeLoopSimplifyForm)
     : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT),
-      SE(SE) {
+      SE(SE), AssumeLoopSimplifyForm(AssumeLoopSimplifyForm) {
   assert(L->getExitBlock() && "No single exit block");
-  assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
+  if (AssumeLoopSimplifyForm)
+    assert(L->getLoopPreheader() && "No preheader in the loop");
+  else
+    assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
   if (UseLAIChecks) {
     setAliasChecks(LAI.getRuntimePointerChecking()->getChecks());
     setSCEVChecks(LAI.getPSE().getUnionPredicate());
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -215,6 +215,14 @@
     cl::desc("The maximum number of SCEV checks allowed with a "
              "vectorize(enable) pragma"));
 
+static cl::opt<bool> EnableEpilogVectorization(
+    "enable-epilog-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of epilog scalar loop."));
+
+static cl::opt<unsigned> MinWidthEpilogVectorization(
+    "min-width-epilog-vectorization", cl::init(16), cl::Hidden,
+    cl::desc("Minimum vector loop width to enable epilog vectorization."));
+
 /// Create an analysis remark that explains why vectorization failed
 ///
 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
@@ -406,20 +414,33 @@
                       const TargetTransformInfo *TTI, AssumptionCache *AC,
                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
-                      LoopVectorizationCostModel *CM)
+                      LoopVectorizationCostModel *CM,
+                      unsigned EpilogVectorLoopWidth = 1,
+                      std::function<const LoopAccessInfo &(Loop &)> *GetLAA = nullptr,
+                      AliasAnalysis *AA = nullptr)
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
-        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+        AA(AA), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()), Induction(nullptr),
         OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
         TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
-        AddedSafetyChecks(false) {}
+        AddedSafetyChecks(false), EpilogVectorLoopWidth(EpilogVectorLoopWidth),
+        ScalarResumeValue(nullptr), AliasCheckResult(nullptr), GetLAA(GetLAA) {}
 
   // Perform the actual loop widening (vectorization).
-  void vectorize() {
+  void vectorize(Value *RemainderItrs = nullptr,
+                 Value *EpilogResumeValue = nullptr,
+                 PHINode *ACR = nullptr,
+                 PHINode *EpilogInduction = nullptr) {
+    AliasCheckResult = ACR;
     // Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop();
+    createEmptyLoop(RemainderItrs, EpilogResumeValue,
+                    EpilogInduction);
     // Widen each instruction in the old loop to a new one in the new loop.
     vectorizeLoop();
+    // If multi version is required, generate next VF nersion code.
+    if (canCreateVectorEpilog())
+      createVectorEpilog();
+
   }
 
   // Return true if any runtime check is added.
@@ -448,7 +469,9 @@
       EdgeMaskCache;
 
   /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop();
+  void createEmptyLoop(Value *RemainderItrs = nullptr,
+                       Value *EpilogResumeValue = nullptr,
+                       PHINode *EpilogInduction = nullptr);
 
   /// Set up the values of the IVs correctly when exiting the vector loop.
   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
@@ -584,19 +607,24 @@
   Value *getOrCreateTripCount(Loop *NewLoop);
 
   /// Returns (and creates if needed) the trip count of the widened loop.
-  Value *getOrCreateVectorTripCount(Loop *NewLoop);
+  Value *getOrCreateVectorTripCount(Loop *NewLoop, bool ReCreateVecTC = false);
 
   /// Emit a bypass check to see if the trip count would overflow, or we
   /// wouldn't have enough iterations to execute one vector loop.
-  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
+                                      unsigned NewVF = 0,
+                                      Value *KnownTripCount = nullptr);
   /// Emit a bypass check to see if the vector trip count is nonzero.
-  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass,
+                                  bool ReCreateVecTC = false);
+
   /// Emit a bypass check to see if all of the SCEV assumptions we've
   /// had to make are correct.
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
   /// Emit bypass checks to check any memory assumptions we may have made.
-  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
+  BasicBlock* emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+  /// Emit bypass checks by checking runtime alias result.
+  BasicBlock* emitAliasResultCheck(Loop *L, BasicBlock *Bypass, Value *ACR);
   /// Add additional metadata to \p To that was not present on \p Orig.
   ///
   /// Currently this is used to add the noalias annotations based on the
@@ -611,6 +639,34 @@
   /// loop.
   void addMetadata(Instruction *To, Instruction *From);
 
+  /// Returns true if epilog vectorization is legal.
+  /// Legality is already performed on the loop while first vector
+  /// loop generation, now checking minimal stuff required for versioning.
+  bool canCreateVectorEpilog() {
+    // Epilog vectorization option should be enabled.
+    if (!EnableEpilogVectorization)
+      return false;
+    // Epilog vector should have a valid vector width.
+    if (EpilogVectorLoopWidth == 1)
+      return false;
+    // Loop should have exit block.
+    if (!OrigLoop->getExitBlock())
+      return false;
+    // Loop should have pre header.
+    if (!OrigLoop->getLoopPreheader())
+      return false;
+    return true;
+  }
+
+  // Transform for vector multi versioning.
+  void createVectorEpilog();
+
+  // Create alias check result phinode.
+  PHINode* createAliasCheckResultPHI();
+
+  // Checks feasiblity of vector multi versioning.
+  bool canCreateEpilogVectorLoop(LoopVectorizationLegality &LVL);
+
   /// \brief Similar to the previous function but it adds the metadata to a
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
@@ -737,12 +793,12 @@
   LoopInfo *LI;
   /// Dominator Tree.
   DominatorTree *DT;
-  /// Alias Analysis.
-  AliasAnalysis *AA;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
+  /// Alias Analysis.
+  AliasAnalysis *AA;
   /// Assumption Cache.
   AssumptionCache *AC;
   /// Interface to emit optimization remarks.
@@ -781,6 +837,8 @@
   BasicBlock *LoopVectorBody;
   /// The scalar loop body.
   BasicBlock *LoopScalarBody;
+  /// Alias check block.
+  BasicBlock* AliasCheckBlock;
   /// A list of all bypass blocks. The first block is the entry of the loop.
   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
 
@@ -824,6 +882,14 @@
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
   DenseMap<PHINode *, Value *> IVEndValues;
+  // Epilog vector loop width.
+  unsigned EpilogVectorLoopWidth;
+  /// Scalar block resume value PHINode.
+  PHINode *ScalarResumeValue;
+  /// Alias check result.
+  PHINode *AliasCheckResult;
+  /// Loop access analysis.
+  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1581,7 +1647,7 @@
 public:
   LoopVectorizationLegality(
       Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
-      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
+      const TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
       const TargetTransformInfo *TTI,
       std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
       OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
@@ -1797,7 +1863,7 @@
   /// unrolling.
   PredicatedScalarEvolution &PSE;
   /// Target Library Info.
-  TargetLibraryInfo *TLI;
+  const TargetLibraryInfo *TLI;
   /// Target Transform Info
   const TargetTransformInfo *TTI;
   /// Dominator Tree.
@@ -1871,6 +1937,9 @@
   struct VectorizationFactor {
     unsigned Width; // Vector width with best cost
     unsigned Cost;  // Cost of the loop with that width
+    VectorizationFactor() : Width(1), Cost(0) {}
+    VectorizationFactor(unsigned Width, unsigned Cost)
+        : Width(Width), Cost(Cost) {}
   };
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to VF. If UserVF is not ZERO
@@ -1878,6 +1947,11 @@
   /// possible.
   VectorizationFactor selectVectorizationFactor(bool OptForSize);
 
+  /// \return The next profitable vector factor after the input vector factor
+  /// If there is not any next profitable vector factor return VectorizationFactor
+  /// with scalar as next possible factor.
+  VectorizationFactor identifyNextProfitableVF(const unsigned IVF);
+
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
@@ -2198,6 +2272,8 @@
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+  /// Profitable vector factors.
+  SmallVector<VectorizationFactor, 8> ProfitableVF;
 };
 
 /// \brief This holds vectorization requirements that must be verified late in
@@ -3159,6 +3235,130 @@
   VectorLoopValueMap.initScalar(Instr, Entry);
 }
 
+/// This function will generate PHINode for alias result check in original
+/// scalar loop's preheader block.
+PHINode* InnerLoopVectorizer::createAliasCheckResultPHI() {
+  // Return nullptr if no alias check created previously.
+  if (!AliasCheckBlock)
+    return nullptr;
+  auto &M = *LoopScalarPreHeader->getTerminator()->getFunction()->getParent();
+  auto &Ctx = M.getContext();
+  // Create PHINode for alias check result.
+  PHINode* ACR =
+      PHINode::Create(Type::getInt1Ty(Ctx), 3, "acr.val", ScalarResumeValue);
+  // From middle block set true incoming value.
+  ACR->addIncoming(Builder.getInt1(true), LoopMiddleBlock);
+  // From bypassing blocks set false incoming value.
+  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+    ACR->addIncoming(Builder.getInt1(false), LoopBypassBlocks[I]);
+  return ACR;
+}
+
+/// Returns true if its feasible to generate vector multi version, else false.
+bool InnerLoopVectorizer::canCreateEpilogVectorLoop(LoopVectorizationLegality &LVL) {
+  // Loop bypass blocks should not be empty.
+  if (!LoopBypassBlocks.size()) {
+    DEBUG(dbgs() << "Loop bypass blocks are empty !\n");
+    return false;
+  }
+  // Unique induction phi node should exist.
+  if (!ScalarResumeValue) {
+    DEBUG(dbgs() << "Unique induction phi node missing !\n");
+    return false;
+  }
+  // Original Trip count & resume value type should be same.
+  if (getOrCreateTripCount(OrigLoop)->getType() != ScalarResumeValue->getType()) {
+    DEBUG(dbgs() << "Type mismatch for resume & tripcount.\n");
+    return false;
+  }
+  // Old Induction variable should exist.
+  if (!OldInduction) {
+    DEBUG(dbgs() << "Scalar loop does not has explicit induction variable.\n");
+    return false;
+  }
+  // Should be legal to vectorize.
+  if (!LVL.canVectorize()) {
+    DEBUG(dbgs() << "MVer-LV: Not vectorizing: Cannot prove legality.\n");
+    return false;
+  }
+  return true;
+}
+
+///   This function will vectorize epilog by generating epilog vector version.
+///   In this function we generate a new loop. The new loop will contain
+///   the vectorized instructions with new vector factor.
+///
+///       [ ] <-- original loop iteration number check.
+///    /   |
+///   /    v
+///  |    [ ] <-- original vector loop bypass (may consist of multiple blocks
+///  |  /  |      having checks like alias checks & scev checks.)
+///  | /   v
+///  ||   [ ]     <-- original vector pre header.
+///  |/    |
+///  |     v
+///  |    [  ] \
+///  |    [  ]_|   <-- original vector loop.
+///  |     |
+///  |     v
+///  |   -[ ]   <--- original middle-block.
+///  |  /  |
+///  | /   v
+///  |----[ ] <-- new version loop iteration number check.
+///   | /  |
+///   |/   v
+///   /   [ ] <-- Previously executed alias result check.
+///  || /  |
+///  ||/   v
+///  |/   [ ]     <-- new vector version pre header.
+///  ||    |
+///  ||    v
+///  ||   [  ] \
+///  ||   [  ]_|   <-- new vector loop.
+///  ||    |
+///  ||    v
+///  ||  -[ ]   <--- new middle-block.
+///  || /  |
+///  | /   v
+///  -|- >[ ]     <--- scalar preheader.
+///   |    |
+///   |    v
+///   |   [ ] \
+///   |   [ ]_|   <-- scalar loop to handle remainder.
+///    \   |
+///     \  v
+///      >[ ]     <-- exit block.
+///   ...
+///
+void InnerLoopVectorizer::createVectorEpilog() {
+  // Create instance of LoopVectorizationLegality.
+  Function *F = LoopScalarPreHeader->getParent();
+  LoopVectorizationRequirements Requirements(*ORE);
+  LoopVectorizeHints Hints(OrigLoop, true, *ORE);
+  LoopVectorizationLegality MVLVL(OrigLoop, PSE, DT, TLI, AA, F, TTI, GetLAA,
+                                  LI, ORE, &Requirements, &Hints);
+  // Check if its legal to do vector multi versioning.
+  if (!canCreateEpilogVectorLoop(MVLVL)) {
+    DEBUG(dbgs() << "Epilog vectorization unable to prove legality !\n");
+    return;
+  }
+  DEBUG(dbgs() << "Epilog vectorization is beneficial with width : "
+               << EpilogVectorLoopWidth << " in Function: "
+               << LoopScalarPreHeader->getParent()->getName() << "\n");
+  // Create PHINode for alias check execution result.
+  AliasCheckResult = createAliasCheckResultPHI();
+  // Identify remaining number of iterations.
+  IRBuilder<> ResumeBuilder(&*LoopScalarPreHeader->getTerminator());
+  Value *RemainderItrs = ResumeBuilder.CreateSub(getOrCreateTripCount(OrigLoop),
+                                                 ScalarResumeValue, "rem.itrs");
+  // Create instance of inner loop vectorizer & vectorize.
+  InnerLoopVectorizer LB(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                         EpilogVectorLoopWidth, 1, &MVLVL, Cost);
+  LB.vectorize(RemainderItrs, ScalarResumeValue,
+               AliasCheckResult, OldInduction);
+}
+
+
 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                       Value *End, Value *Step,
                                                       Instruction *DL) {
@@ -3236,8 +3436,9 @@
   return TripCount;
 }
 
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
-  if (VectorTripCount)
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L,
+                                                       bool ReCreateVecTC) {
+  if (!ReCreateVecTC && VectorTripCount)
     return VectorTripCount;
 
   Value *TC = getOrCreateTripCount(L);
@@ -3268,16 +3469,20 @@
   return VectorTripCount;
 }
 
-void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
-                                                         BasicBlock *Bypass) {
-  Value *Count = getOrCreateTripCount(L);
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(
+    Loop *L, BasicBlock *Bypass, unsigned NewVF, Value *KnownTripCount) {
+  // If its a known trip count then use it.
+  Value *Count = KnownTripCount ? KnownTripCount : getOrCreateTripCount(L);
+  // Minimum iteration count.
+  unsigned MinItrsCount = NewVF ? NewVF : (VF * UF);
   BasicBlock *BB = L->getLoopPreheader();
   IRBuilder<> Builder(BB->getTerminator());
 
   // Generate code to check that the loop's trip count that we computed by
   // adding one to the backedge-taken count will not overflow.
   Value *CheckMinIters = Builder.CreateICmpULT(
-      Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+      Count, ConstantInt::get(Count->getType(), MinItrsCount),
+      "min.iters.check");
 
   BasicBlock *NewBB =
       BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
@@ -3293,8 +3498,9 @@
 }
 
 void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
-                                                     BasicBlock *Bypass) {
-  Value *TC = getOrCreateVectorTripCount(L);
+                                                     BasicBlock *Bypass,
+                                                     bool ReCreateVecTC) {
+  Value *TC = getOrCreateVectorTripCount(L, ReCreateVecTC);
   BasicBlock *BB = L->getLoopPreheader();
   IRBuilder<> Builder(BB->getTerminator());
 
@@ -3347,7 +3553,8 @@
   AddedSafetyChecks = true;
 }
 
-void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+BasicBlock* InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
+                                                      BasicBlock *Bypass) {
   BasicBlock *BB = L->getLoopPreheader();
 
   // Generate the code that checks in runtime if arrays overlap. We put the
@@ -3358,7 +3565,7 @@
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
   if (!MemRuntimeCheck)
-    return;
+    return nullptr;
 
   // Create a new block containing the memory check.
   BB->setName("vector.memcheck");
@@ -3377,11 +3584,44 @@
   // We currently don't use LoopVersioning for the actual loop cloning but we
   // still use it to add the noalias metadata.
   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
-                                           PSE.getSE());
+                                           PSE.getSE(), true, true);
+  LVer->prepareNoAliasMetadata();
+  // Return alias check basic block.
+  return BB;
+}
+
+/// Generate a check for alias check result, if alias check is executed
+/// with no-alias result then jump to epilog loop min iteration check
+/// else jump to scalar loop.
+BasicBlock *InnerLoopVectorizer::emitAliasResultCheck(Loop *L,
+                                                      BasicBlock *Bypass,
+                                                      Value *ACR) {
+
+  BasicBlock *BB = L->getLoopPreheader();
+  // Create a new block containing the memory check.
+  BB->setName("alias.result.chk");
+  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+  // Update dominator tree immediately if the generated block is a
+  // LoopBypassBlock because SCEV expansions to generate loop bypass
+  // checks may query it before the current function is finished.
+  DT->addNewBlock(NewBB, BB);
+  if (L->getParentLoop())
+    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+  ReplaceInstWithInst(BB->getTerminator(),
+                      BranchInst::Create(NewBB, Bypass, ACR));
+  LoopBypassBlocks.push_back(BB);
+
+  // We currently don't use LoopVersioning for the actual loop cloning but we
+  // still use it to add the noalias metadata.
+  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
+                                           PSE.getSE(), true, true);
   LVer->prepareNoAliasMetadata();
+  return BB;
 }
 
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createEmptyLoop(Value *RemainderItrs,
+                                          Value *EpilogResumeValue,
+                                          PHINode *EpilogInduction) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
@@ -3431,7 +3671,8 @@
   //   - counts from zero, stepping by one
   //   - is the size of the widest induction variable type
   // then we create a new one.
-  OldInduction = Legal->getPrimaryInduction();
+  OldInduction =
+      EpilogInduction ? EpilogInduction : Legal->getPrimaryInduction();
   Type *IdxTy = Legal->getWidestInductionType();
 
   // Split the single block loop into the two loop structure described above.
@@ -3462,14 +3703,24 @@
 
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
 
-  // We need to test whether the backedge-taken count is uint##_max. Adding one
-  // to it will cause overflow and an incorrect loop trip count in the vector
-  // body. In case of overflow we want to directly jump to the scalar remainder
-  // loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
-  emitVectorLoopEnteredCheck(Lp, ScalarPH);
+  if (RemainderItrs) {
+    // Create minimum iteration check against remaining iterations.
+    emitMinimumIterationCountCheck(Lp, ScalarPH, VF, RemainderItrs);
+    // Now, compare the new count to zero.
+    // If it is zero skip the vector loop and jump to the scalar loop.
+    // Enfore recreation of vector trip count.
+    emitVectorLoopEnteredCheck(Lp, ScalarPH, true);
+  } else {
+    // We need to test whether the backedge-taken count is uint##_max.
+    // Adding one to it will cause overflow and an incorrect loop trip count
+    // in the vector body. In case of overflow we want to directly jump to the
+    // scalar remainder loop.
+    emitMinimumIterationCountCheck(Lp, ScalarPH);
+    // Now, compare the new count to zero. If it is zero skip the vector loop
+    // and jump to the scalar loop.
+    emitVectorLoopEnteredCheck(Lp, ScalarPH);
+  }
+
   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
   emitSCEVChecks(Lp, ScalarPH);
@@ -3477,17 +3728,27 @@
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, ScalarPH);
-
+  // If alias check is already executed decide based on its result,
+  // else generate runtime alias check.
+  BasicBlock *RuntimeCheckBlock = nullptr;
+  if (AliasCheckResult) {
+    RuntimeCheckBlock = emitAliasResultCheck(Lp, ScalarPH, AliasCheckResult);
+  } else {
+    RuntimeCheckBlock = emitMemRuntimeChecks(Lp, ScalarPH);
+  }
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
-  Induction =
-      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
-                              getDebugLocFromInstOrOperands(OldInduction));
-
+  if (EpilogResumeValue) {
+    Induction = createInductionVariable(Lp, EpilogResumeValue,
+                               CountRoundDown, Step,
+                               getDebugLocFromInstOrOperands(OldInduction));
+  } else {
+    Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                                getDebugLocFromInstOrOperands(OldInduction));
+  }
   // We are going to resume the execution of the scalar loop.
   // Go over all of the induction variables that we found and fix the
   // PHIs that are left in the scalar version of the loop.
@@ -3499,6 +3760,7 @@
   // This variable saves the new starting index for the scalar loop. It is used
   // to test if there are any tail iterations left once the vector loop has
   // completed.
+  ScalarResumeValue = nullptr;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (auto &InductionEntry : *List) {
     PHINode *OrigPhi = InductionEntry.first;
@@ -3531,11 +3793,22 @@
 
     // The old induction's phi node in the scalar body needs the truncated
     // value.
-    for (BasicBlock *BB : LoopBypassBlocks)
-      BCResumeVal->addIncoming(II.getStartValue(), BB);
+    if (EpilogResumeValue) {
+      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        BCResumeVal->addIncoming(EpilogResumeValue, LoopBypassBlocks[I]);
+    } else {
+      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
+    }
     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    ScalarResumeValue = BCResumeVal;
   }
 
+  // NOTE: Currently multiple induction variables not supported as multiple
+  // induction variable incurs multiple checks which may not be profitable.
+  if (List->size() > 1)
+    ScalarResumeValue = nullptr;
+
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
@@ -3555,6 +3828,7 @@
   LoopExitBlock = ExitBlock;
   LoopVectorBody = VecBody;
   LoopScalarBody = OldBasicBlock;
+  AliasCheckBlock = RuntimeCheckBlock;
 
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
@@ -5025,8 +5299,10 @@
   PSE.getSE()->forgetLoop(OrigLoop);
 
   // Update the dominator tree information.
-  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
-         "Entry does not dominate exit.");
+  // With epilog vectorization not all loop bypass blocks dominates exit blocks.
+  if (!EnableEpilogVectorization)
+    assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
+           "Entry does not dominate exit.");
 
   // We don't predicate stores by this point, so the vector body should be a
   // single loop.
@@ -5035,7 +5311,14 @@
   DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
+  // Change immedeate dominator only when:
+  // 1) When epilog vectorization is disabled.
+  // 2) When epilog vectorization is enabled and EpilogVectorLoopWidth
+  //    is not 1, this is when generating original vector version.
+  //    EpilogVectorLoopWidth will be 1 while generating epilog vector
+  //    version.
+  if (!EnableEpilogVectorization || EpilogVectorLoopWidth != 1)
+    DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
 
   DEBUG(DT->verifyDomTree());
 }
@@ -6128,6 +6411,31 @@
   }
 }
 
+/// Identify epilog vector loop width from profitable VF's.
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::identifyNextProfitableVF(const unsigned Width) {
+  // Find next VF.
+  unsigned VF = 1;
+  unsigned Cost = 0;
+  bool hasMultipleInductionVariables = (Legal->getInductionVars()->size() != 1);
+  // Check the loop vector width should be more than min width threshold.
+  if (Width < MinWidthEpilogVectorization)
+    return VectorizationFactor(VF, Cost);
+  // Return if epilog vectorization option not provided.
+  // NOTE: Currently multiple induction variables not supported as multiple
+  // induction variable incurs multiple checks which may not be profitable.
+  if (!EnableEpilogVectorization || hasMultipleInductionVariables)
+    return VectorizationFactor(VF, Cost);
+  // Identify next profitable VF.
+  for (auto &NextVF : ProfitableVF)
+    // NOTE: VF's in ProfitableVF is in accesding order.
+    if ((NextVF.Width < Width) && (VF == 1 || NextVF.Cost < Cost)) {
+      VF = NextVF.Width;
+      Cost = NextVF.Cost;
+    }
+  return VectorizationFactor(VF, Cost);
+}
+
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   // Width 1 means no vectorize
@@ -6276,6 +6584,9 @@
                  << " because it will not generate any vector instructions.\n");
       continue;
     }
+    // If profitable add it to ProfitableVF list.
+    if (VectorCost < ScalarCost)
+      ProfitableVF.push_back(VectorizationFactor(i, VectorCost));
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
@@ -7671,6 +7982,9 @@
   }
 
   using namespace ore;
+  // Identify multi version loop width
+  const LoopVectorizationCostModel::VectorizationFactor EpilogVF =
+      CM.identifyNextProfitableVF(VF.Width);
   if (!VectorizeLoop) {
     assert(IC > 1 && "interleave count should not be 1 or 0");
     // If we decided that it is not legal to vectorize the loop, then
@@ -7686,7 +8000,7 @@
   } else {
     // If we decided that it is *legal* to vectorize the loop, then do it.
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                           &LVL, &CM);
+                           &LVL, &CM, EpilogVF.Width, GetLAA, AA);
     LB.vectorize();
     ++LoopsVectorized;