Index: include/llvm/Transforms/Utils/LoopVersioning.h
===================================================================
--- include/llvm/Transforms/Utils/LoopVersioning.h
+++ include/llvm/Transforms/Utils/LoopVersioning.h
@@ -41,7 +41,7 @@
   /// object having no checks and we expect the user to add them.
   LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
                  DominatorTree *DT, ScalarEvolution *SE,
-                 bool UseLAIChecks = true);
+                 bool UseLAIChecks = true, bool AssumeLoopSimplifyForm = false);
 
   /// \brief Performs the CFG manipulation part of versioning the loop including
   /// the DominatorTree and LoopInfo updates.
@@ -146,6 +146,9 @@
   LoopInfo *LI;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  // Flag to allow non simplify form loops in LoopVersioning.
+  // Minimal expectation is Loop should have a pre header & single exit block.
+  bool AssumeLoopSimplifyForm;
 };
 }
 
Index: lib/Transforms/Utils/LoopVersioning.cpp
===================================================================
--- lib/Transforms/Utils/LoopVersioning.cpp
+++ lib/Transforms/Utils/LoopVersioning.cpp
@@ -32,11 +32,14 @@
 
 LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
                                DominatorTree *DT, ScalarEvolution *SE,
-                               bool UseLAIChecks)
+                               bool UseLAIChecks, bool AssumeLoopSimplifyForm)
     : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT),
-      SE(SE) {
+      SE(SE), AssumeLoopSimplifyForm(AssumeLoopSimplifyForm) {
   assert(L->getExitBlock() && "No single exit block");
-  assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
+  if (AssumeLoopSimplifyForm)
+    assert(L->getLoopPreheader() && "No preheader in the loop");
+  else
+    assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
   if (UseLAIChecks) {
     setAliasChecks(LAI.getRuntimePointerChecking()->getChecks());
     setSCEVChecks(LAI.getPSE().getUnionPredicate());
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -215,6 +215,14 @@
     cl::desc("The maximum number of SCEV checks allowed with a "
              "vectorize(enable) pragma"));
 
+static cl::opt<bool> EnableEpilogVectorization(
+    "enable-epilog-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of epilog scalar loop."));
+
+static cl::opt<unsigned> MinWidthEpilogVectorization(
+    "min-width-epilog-vectorization", cl::init(16), cl::Hidden,
+    cl::desc("Minimum vector loop width to enable epilog vectorization."));
+
 /// Create an analysis remark that explains why vectorization failed
 ///
 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
@@ -386,6 +394,67 @@
                            : ConstantFP::get(Ty, C);
 }
 
+class EpilogLoopInfo {
+public:
+  EpilogLoopInfo(Value *RemainderItrs, Value *EpilogResumeValue,
+                 PHINode *EpilogInduction, PHINode *ACR)
+      : RemainderItrs(RemainderItrs), EpilogResumeValue(EpilogResumeValue),
+        EpilogInduction(EpilogInduction), ACR(ACR) {}
+
+  EpilogLoopInfo()
+      : RemainderItrs(nullptr), EpilogResumeValue(nullptr),
+        EpilogInduction(nullptr), ACR(nullptr) {}
+
+  Value *getRemainderItrs() { return RemainderItrs; }
+  Value *getEpilogResumeValue() { return EpilogResumeValue; }
+  PHINode *getEpilogInduction() { return EpilogInduction; }
+  PHINode *getAliasCheckResult() { return ACR; }
+
+private:
+  Value *RemainderItrs;
+  Value *EpilogResumeValue;
+  PHINode *EpilogInduction;
+  PHINode *ACR;
+};
+
+class EpilogVectorizationInfo {
+public:
+  EpilogVectorizationInfo()
+      : ELI(nullptr), EpilogVectorLoopWidth(1), EpilogVecProfitable(false) {}
+  EpilogVectorizationInfo(EpilogLoopInfo *ELI, unsigned Width, bool EVP)
+      : ELI(ELI), EpilogVectorLoopWidth(Width), EpilogVecProfitable(EVP) {}
+
+  void setEpilogVectorizationInfo(unsigned V, bool _EpilogVecProfitable) {
+    EpilogVectorLoopWidth = V;
+    EpilogVecProfitable = _EpilogVecProfitable;
+  }
+
+  unsigned getEpilogVectorLoopWidth() { return EpilogVectorLoopWidth; }
+
+  Value *getRemainderItrs() { return ELI ? ELI->getRemainderItrs() : nullptr; }
+
+  Value *getEpilogResumeValue() {
+    return ELI ? ELI->getEpilogResumeValue() : nullptr;
+  }
+
+  PHINode *getEpilogInduction() {
+    return ELI ? ELI->getEpilogInduction() : nullptr;
+  }
+
+  PHINode *getAliasCheckResult() {
+    return ELI ? ELI->getAliasCheckResult() : nullptr;
+  }
+
+  bool hasELI() { return ELI ? true : false; }
+
+  bool isEpilogVecProfitable() { return EpilogVecProfitable; }
+
+private:
+  EpilogLoopInfo *ELI;
+  unsigned EpilogVectorLoopWidth;
+  bool EpilogVecProfitable;
+};
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -408,13 +477,14 @@
                       const TargetTransformInfo *TTI, AssumptionCache *AC,
                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
-                      LoopVectorizationCostModel *CM)
+                      LoopVectorizationCostModel *CM,
+                      const EpilogVectorizationInfo &EVI)
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()), Induction(nullptr),
         OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
         TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
-        AddedSafetyChecks(false) {}
+        AddedSafetyChecks(false), ScalarResumeValue(nullptr), EVI(EVI) {}
 
   // Perform the actual loop widening (vectorization).
   void vectorize() {
@@ -422,6 +492,9 @@
     createEmptyLoop();
     // Widen each instruction in the old loop to a new one in the new loop.
     vectorizeLoop();
+    // Check for epilog vectorization feasiblity:
+    if (canCreateVectorEpilog())
+      createVectorEpilog();
   }
 
   // Return true if any runtime check is added.
@@ -588,7 +661,7 @@
   Value *getOrCreateTripCount(Loop *NewLoop);
 
   /// Returns (and creates if needed) the trip count of the widened loop.
-  Value *getOrCreateVectorTripCount(Loop *NewLoop);
+  Value *getOrCreateVectorTripCount(Loop *NewLoop, bool ReCreateVecTC = false);
 
   /// Emit a bypass check to see if the trip count would overflow, or we
   /// wouldn't have enough iterations to execute one vector loop.
@@ -599,8 +672,9 @@
   /// had to make are correct.
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
   /// Emit bypass checks to check any memory assumptions we may have made.
-  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
+  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+  /// Emit bypass checks by checking runtime alias result.
+  BasicBlock *emitAliasResultCheck(Loop *L, BasicBlock *Bypass);
   /// Add additional metadata to \p To that was not present on \p Orig.
   ///
   /// Currently this is used to add the noalias annotations based on the
@@ -615,6 +689,45 @@
   /// loop.
   void addMetadata(Instruction *To, Instruction *From);
 
+  /// Returns true if epilog vectorization is legal & profitable.
+  /// Legality is already performed on the loop while first vector
+  /// loop generation, now checking minimal stuff required for epilog
+  /// vectorization.
+  bool canCreateVectorEpilog() {
+    // Epilog vectorization option should be enabled.
+    if (!EnableEpilogVectorization)
+      return false;
+    // Identified epilog vector width should be a valid vector width.
+    if (EVI.getEpilogVectorLoopWidth() == 1)
+      return false;
+    // Loop should have exit block.
+    if (!OrigLoop->getExitBlock())
+      return false;
+    // Loop should have pre header.
+    if (!OrigLoop->getLoopPreheader())
+      return false;
+    // Unique induction phi node should exist.
+    if (!ScalarResumeValue)
+      return false;
+    // Loop bypass blocks should not be empty.
+    if (!LoopBypassBlocks.size())
+      return false;
+    // Original Trip count & resume value type should be same.
+    if (getOrCreateTripCount(OrigLoop)->getType() !=
+        ScalarResumeValue->getType())
+      return false;
+    // Old Induction variable should exist.
+    if (!OldInduction)
+      return false;
+    return true;
+  }
+
+  // Transform for epilog vectorization.
+  void createVectorEpilog();
+
+  // Create alias check result phinode.
+  PHINode *createAliasCheckResultPHI();
+
   /// \brief Similar to the previous function but it adds the metadata to a
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
@@ -785,6 +898,8 @@
   BasicBlock *LoopVectorBody;
   /// The scalar loop body.
   BasicBlock *LoopScalarBody;
+  /// Alias check block.
+  BasicBlock *AliasCheckBlock;
   /// A list of all bypass blocks. The first block is the entry of the loop.
   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
 
@@ -828,6 +943,10 @@
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
   DenseMap<PHINode *, Value *> IVEndValues;
+  /// Scalar block resume value PHINode.
+  PHINode *ScalarResumeValue;
+  // Epilog vectorization info.
+  EpilogVectorizationInfo EVI;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -840,7 +959,7 @@
                     LoopVectorizationLegality *LVL,
                     LoopVectorizationCostModel *CM)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
-                            UnrollFactor, LVL, CM) {}
+                            UnrollFactor, LVL, CM, EpilogVectorizationInfo()) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -1577,7 +1696,7 @@
 public:
   LoopVectorizationLegality(
       Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
-      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
+      const TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
       const TargetTransformInfo *TTI,
       std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
       OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
@@ -1785,7 +1904,7 @@
   /// unrolling.
   PredicatedScalarEvolution &PSE;
   /// Target Library Info.
-  TargetLibraryInfo *TLI;
+  const TargetLibraryInfo *TLI;
   /// Target Transform Info
   const TargetTransformInfo *TTI;
   /// Dominator Tree.
@@ -1859,6 +1978,9 @@
   struct VectorizationFactor {
     unsigned Width; // Vector width with best cost
     unsigned Cost;  // Cost of the loop with that width
+    VectorizationFactor() : Width(1), Cost(0) {}
+    VectorizationFactor(unsigned Width, unsigned Cost)
+        : Width(Width), Cost(Cost) {}
   };
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to VF. If UserVF is not ZERO
@@ -1866,6 +1988,13 @@
   /// possible.
   VectorizationFactor selectVectorizationFactor(bool OptForSize);
 
+  /// \return The next profitable vector factor after the input vector factor
+  /// If there is not any next profitable vector factor return
+  /// VectorizationFactor
+  /// with scalar as next possible factor.
+  VectorizationFactor identifyNextProfitableVF(const unsigned IVF,
+                                               bool OptForSize);
+
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
@@ -2186,6 +2315,8 @@
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+  /// Profitable vector factors.
+  SmallVector<VectorizationFactor, 8> ProfitableVF;
 };
 
 /// \brief This holds vectorization requirements that must be verified late in
@@ -3188,6 +3319,90 @@
   VectorLoopValueMap.initScalar(Instr, Entry);
 }
 
+/// This function will generate PHINode for alias result check in original
+/// scalar loop's preheader block.
+PHINode *InnerLoopVectorizer::createAliasCheckResultPHI() {
+  // Return nullptr if no alias check created previously.
+  if (!AliasCheckBlock)
+    return nullptr;
+  auto &M = *LoopScalarPreHeader->getTerminator()->getFunction()->getParent();
+  auto &Ctx = M.getContext();
+  // Create PHINode for alias check result.
+  PHINode *ACR =
+      PHINode::Create(Type::getInt1Ty(Ctx), 3, "acr.val", ScalarResumeValue);
+  // From middle block set true incoming value.
+  ACR->addIncoming(Builder.getInt1(true), LoopMiddleBlock);
+  // From bypassing blocks set false incoming value.
+  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+    ACR->addIncoming(Builder.getInt1(false), LoopBypassBlocks[I]);
+  return ACR;
+}
+
+///   This function will vectorize epilog by generating epilog vector version.
+///   In this function we generate a new loop. The new loop will contain
+///   the vectorized instructions with new vector factor.
+///
+///       [ ] <-- original loop iteration number check.
+///    /   |
+///   /    v
+///  |    [ ] <-- original vector loop bypass (may consist of multiple blocks
+///  |  /  |      having checks like alias checks & scev checks.)
+///  | /   v
+///  ||   [ ]     <-- original vector pre header.
+///  |/    |
+///  |     v
+///  |    [  ] \
+///  |    [  ]_|   <-- original vector loop.
+///  |     |
+///  |     v
+///  |   -[ ]   <--- original middle-block.
+///  |  /  |
+///  | /   v
+///  |----[ ] <-- new version loop iteration number check.
+///   | /  |
+///   |/   v
+///   /   [ ] <-- Previously executed alias result check.
+///  || /  |
+///  ||/   v
+///  |/   [ ]     <-- new vector version pre header.
+///  ||    |
+///  ||    v
+///  ||   [  ] \
+///  ||   [  ]_|   <-- new vector loop.
+///  ||    |
+///  ||    v
+///  ||  -[ ]   <--- new middle-block.
+///  || /  |
+///  | /   v
+///  -|- >[ ]     <--- scalar preheader.
+///   |    |
+///   |    v
+///   |   [ ] \
+///   |   [ ]_|   <-- scalar loop to handle remainder.
+///    \   |
+///     \  v
+///      >[ ]     <-- exit block.
+///   ...
+///
+void InnerLoopVectorizer::createVectorEpilog() {
+  DEBUG(dbgs() << "Epilog vectorization is beneficial with width : "
+               << EVI.getEpilogVectorLoopWidth() << " in Function: "
+               << LoopScalarPreHeader->getParent()->getName() << "\n");
+  // Create PHINode for alias check execution result.
+  PHINode *AliasCheckResult = createAliasCheckResultPHI();
+  // Identify remaining number of iterations.
+  IRBuilder<> ResumeBuilder(&*LoopScalarPreHeader->getTerminator());
+  Value *RemainderItrs = ResumeBuilder.CreateSub(getOrCreateTripCount(OrigLoop),
+                                                 ScalarResumeValue, "rem.itrs");
+  // Create instance of inner loop vectorizer & vectorize.
+  EpilogLoopInfo NewELI(RemainderItrs, ScalarResumeValue, OldInduction,
+                        AliasCheckResult);
+  InnerLoopVectorizer LB(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                         EVI.getEpilogVectorLoopWidth(), /*EpilogUF*/ 1, Legal,
+                         Cost, EpilogVectorizationInfo(&NewELI, 1, true));
+  LB.vectorize();
+}
+
 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                       Value *End, Value *Step,
                                                       Instruction *DL) {
@@ -3265,8 +3480,9 @@
   return TripCount;
 }
 
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
-  if (VectorTripCount)
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L,
+                                                       bool ReCreateVecTC) {
+  if (!ReCreateVecTC && VectorTripCount)
     return VectorTripCount;
 
   Value *TC = getOrCreateTripCount(L);
@@ -3299,7 +3515,9 @@
 
 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                          BasicBlock *Bypass) {
-  Value *Count = getOrCreateTripCount(L);
+  // If its a known trip count then use it.
+  Value *Count =
+      EVI.hasELI() ? EVI.getRemainderItrs() : getOrCreateTripCount(L);
   BasicBlock *BB = L->getLoopPreheader();
   IRBuilder<> Builder(BB->getTerminator());
 
@@ -3323,7 +3541,7 @@
 
 void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
                                                      BasicBlock *Bypass) {
-  Value *TC = getOrCreateVectorTripCount(L);
+  Value *TC = getOrCreateVectorTripCount(L, (EVI.hasELI() ? true : false));
   BasicBlock *BB = L->getLoopPreheader();
   IRBuilder<> Builder(BB->getTerminator());
 
@@ -3376,7 +3594,8 @@
   AddedSafetyChecks = true;
 }
 
-void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
+                                                      BasicBlock *Bypass) {
   BasicBlock *BB = L->getLoopPreheader();
 
   // Generate the code that checks in runtime if arrays overlap. We put the
@@ -3387,7 +3606,7 @@
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
   if (!MemRuntimeCheck)
-    return;
+    return nullptr;
 
   // Create a new block containing the memory check.
   BB->setName("vector.memcheck");
@@ -3406,8 +3625,38 @@
   // We currently don't use LoopVersioning for the actual loop cloning but we
   // still use it to add the noalias metadata.
   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
-                                           PSE.getSE());
+                                           PSE.getSE(), true, true);
+  LVer->prepareNoAliasMetadata();
+  // Return alias check basic block.
+  return BB;
+}
+
+/// Generate a check for alias check result, if alias check is executed
+/// with no-alias result then jump to epilog loop min iteration check
+/// else jump to scalar loop.
+BasicBlock *InnerLoopVectorizer::emitAliasResultCheck(Loop *L,
+                                                      BasicBlock *Bypass) {
+
+  BasicBlock *BB = L->getLoopPreheader();
+  // Create a new block containing the memory check.
+  BB->setName("alias.result.chk");
+  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+  // Update dominator tree immediately if the generated block is a
+  // LoopBypassBlock because SCEV expansions to generate loop bypass
+  // checks may query it before the current function is finished.
+  DT->addNewBlock(NewBB, BB);
+  if (L->getParentLoop())
+    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+  ReplaceInstWithInst(
+      BB->getTerminator(),
+      BranchInst::Create(NewBB, Bypass, EVI.getAliasCheckResult()));
+  LoopBypassBlocks.push_back(BB);
+  // We currently don't use LoopVersioning for the actual loop cloning but we
+  // still use it to add the noalias metadata.
+  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
+                                           PSE.getSE(), true, true);
   LVer->prepareNoAliasMetadata();
+  return BB;
 }
 
 void InnerLoopVectorizer::createEmptyLoop() {
@@ -3460,7 +3709,8 @@
   //   - counts from zero, stepping by one
   //   - is the size of the widest induction variable type
   // then we create a new one.
-  OldInduction = Legal->getPrimaryInduction();
+  OldInduction =
+      EVI.hasELI() ? EVI.getEpilogInduction() : Legal->getPrimaryInduction();
   Type *IdxTy = Legal->getWidestInductionType();
 
   // Split the single block loop into the two loop structure described above.
@@ -3506,17 +3756,29 @@
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, ScalarPH);
-
+  // If alias check is already executed decide based on its result,
+  // else generate runtime alias check.
+  BasicBlock *RuntimeCheckBlock = nullptr;
+  if (EVI.getAliasCheckResult()) {
+    RuntimeCheckBlock = emitAliasResultCheck(Lp, ScalarPH);
+  } else {
+    RuntimeCheckBlock = emitMemRuntimeChecks(Lp, ScalarPH);
+  }
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
-  Induction =
-      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
-                              getDebugLocFromInstOrOperands(OldInduction));
-
+  Value *EpilogResumeValue = EVI.getEpilogResumeValue();
+  if (EpilogResumeValue) {
+    Induction =
+        createInductionVariable(Lp, EpilogResumeValue, CountRoundDown, Step,
+                                getDebugLocFromInstOrOperands(OldInduction));
+  } else {
+    Induction =
+        createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                                getDebugLocFromInstOrOperands(OldInduction));
+  }
   // We are going to resume the execution of the scalar loop.
   // Go over all of the induction variables that we found and fix the
   // PHIs that are left in the scalar version of the loop.
@@ -3528,6 +3790,7 @@
   // This variable saves the new starting index for the scalar loop. It is used
   // to test if there are any tail iterations left once the vector loop has
   // completed.
+  ScalarResumeValue = nullptr;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (auto &InductionEntry : *List) {
     PHINode *OrigPhi = InductionEntry.first;
@@ -3560,11 +3823,22 @@
 
     // The old induction's phi node in the scalar body needs the truncated
     // value.
-    for (BasicBlock *BB : LoopBypassBlocks)
-      BCResumeVal->addIncoming(II.getStartValue(), BB);
+    if (EpilogResumeValue) {
+      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        BCResumeVal->addIncoming(EpilogResumeValue, LoopBypassBlocks[I]);
+    } else {
+      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
+    }
     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    ScalarResumeValue = BCResumeVal;
   }
 
+  // NOTE: Currently multiple induction variables not supported as multiple
+  // induction variable incurs multiple checks which may not be profitable.
+  if (List->size() > 1)
+    ScalarResumeValue = nullptr;
+
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
@@ -3584,6 +3858,7 @@
   LoopExitBlock = ExitBlock;
   LoopVectorBody = VecBody;
   LoopScalarBody = OldBasicBlock;
+  AliasCheckBlock = RuntimeCheckBlock;
 
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
@@ -5021,8 +5296,10 @@
   PSE.getSE()->forgetLoop(OrigLoop);
 
   // Update the dominator tree information.
-  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
-         "Entry does not dominate exit.");
+  // With epilog vectorization not all loop bypass blocks dominates exit blocks.
+  if (!EnableEpilogVectorization)
+    assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
+           "Entry does not dominate exit.");
 
   // We don't predicate stores by this point, so the vector body should be a
   // single loop.
@@ -5031,7 +5308,16 @@
   DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
+  // Change immediate dominator only when:
+  // 1) When epilog vectorization option is disabled.
+  // 2) When epilog vectorization option is enabled & not profitable.
+  // 3) When epilog vectorization option is enabled & profitable
+  //    for the given loop, only update immediate dominator for
+  //    first vector loop.
+  bool EpilogVecProfitable = EVI.isEpilogVecProfitable();
+  if (!EnableEpilogVectorization || !EpilogVecProfitable ||
+      (EpilogVecProfitable && EVI.getEpilogVectorLoopWidth() != 1))
+    DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
 
   DEBUG(DT->verifyDomTree());
 }
@@ -6129,6 +6415,35 @@
   }
 }
 
+/// Identify epilog vector loop width from profitable VF's.
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::identifyNextProfitableVF(const unsigned Width,
+                                                     bool OptForSize) {
+  // Find next VF.
+  unsigned VF = 1;
+  unsigned Cost = 0;
+  bool hasMultipleInductionVariables = (Legal->getInductionVars()->size() != 1);
+  // Return if epilog vectorization is disabled.
+  // Return if OptForSize is enabled
+  if (!EnableEpilogVectorization || OptForSize)
+    return VectorizationFactor(VF, Cost);
+  // NOTE: Currently multiple induction variables not supported as multiple
+  // induction variable incurs multiple checks which may not be profitable.
+  if (hasMultipleInductionVariables)
+    return VectorizationFactor(VF, Cost);
+  // Check the loop vector width should be more than min width threshold.
+  if (Width < MinWidthEpilogVectorization)
+    return VectorizationFactor(VF, Cost);
+  // Identify next profitable VF.
+  for (auto &NextVF : ProfitableVF)
+    // NOTE: VF's in ProfitableVF is in accesding order.
+    if ((NextVF.Width < Width) && (VF == 1 || NextVF.Cost < Cost)) {
+      VF = NextVF.Width;
+      Cost = NextVF.Cost;
+    }
+  return VectorizationFactor(VF, Cost);
+}
+
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   // Width 1 means no vectorize
@@ -6277,6 +6592,9 @@
                  << " because it will not generate any vector instructions.\n");
       continue;
     }
+    // If profitable add it to ProfitableVF list.
+    if (VectorCost < ScalarCost)
+      ProfitableVF.push_back(VectorizationFactor(i, VectorCost));
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
@@ -7672,6 +7990,12 @@
   }
 
   using namespace ore;
+  // Identify epilog vector loop width
+  const LoopVectorizationCostModel::VectorizationFactor EpilogVF =
+      CM.identifyNextProfitableVF(VF.Width, OptForSize);
+  // Check profitablity of epilog vectorization.
+  bool EpilogVecProfitable = (EpilogVF.Width == 1) ? 0 : 1;
+  // Switch indicates widening of epilog loop.
   if (!VectorizeLoop) {
     assert(IC > 1 && "interleave count should not be 1 or 0");
     // If we decided that it is not legal to vectorize the loop, then
@@ -7686,8 +8010,9 @@
               << NV("InterleaveCount", IC) << ")");
   } else {
     // If we decided that it is *legal* to vectorize the loop, then do it.
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                           &LVL, &CM);
+    InnerLoopVectorizer LB(
+        L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM,
+        EpilogVectorizationInfo(nullptr, EpilogVF.Width, EpilogVecProfitable));
     LB.vectorize();
     ++LoopsVectorized;
 
Index: test/Transforms/LoopVectorize/epilogvec1.ll
===================================================================
--- test/Transforms/LoopVectorize/epilogvec1.ll
+++ test/Transforms/LoopVectorize/epilogvec1.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -S -enable-epilog-vectorization -debug-only=loop-vectorize -loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; Test to check epilog loop vectorization.
+; CHECK: LV: Found a vectorizable loop (16)
+; CHECK: Epilog vectorization is beneficial with width : 8
+; CHECK-LABEL: @foo(
+; CHECK-LABEL: vector.body:
+; CHECK: <16 x i8>
+; CHECK: scalar.ph:
+; CHECK-NEXT: %acr.val = phi i1 [ true, %middle.block ], [ false, %for.body.preheader ], [ false, %min.iters.checked ], [ false, %vector.memcheck ]
+; CHECK-LABEL: vector.body10:
+; CHECK: <8 x i8>
+; CHECK-LABEL: for.body:
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture readonly %C, i32 %len) local_unnamed_addr #0 {
+entry:
+  %cmp14 = icmp sgt i32 %len, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %len to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %add = add i8 %1, %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx6, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
Index: test/Transforms/LoopVectorize/epilogvec2.ll
===================================================================
--- test/Transforms/LoopVectorize/epilogvec2.ll
+++ test/Transforms/LoopVectorize/epilogvec2.ll
@@ -0,0 +1,82 @@
+; RUN: opt < %s -S -enable-epilog-vectorization -debug-only=loop-vectorize -loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; Test to check epilog loop vectorization.
+; CHECK: LV: Found a vectorizable loop (16)
+; CHECK: Epilog vectorization is beneficial with width : 4
+; CHECK-LABEL: @foo(
+
+; CHECK-LABEL: vector.body:
+; CHECK: <16 x i8>
+; CHECK-LABEL: scalar.ph:
+; CHECK-NEXT: %acr.val = phi i1 [ true, %middle.block ], [ false, %for.body4.preheader ], [ false, %min.iters.checked ], [ false, %vector.memcheck ]
+; CHECK-NEXT: %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body4.preheader ], [ 0, %min.iters.checked ], [ 0, %vector.memcheck ]
+; CHECK-NEXT: %rem.itrs = sub i64 %wide.trip.count, %bc.resume.val
+; CHECK-NEXT: %min.iters.check13 = icmp ult i64 %rem.itrs, 4
+; CHECK-NEXT: br i1 %min.iters.check13, label %scalar.ph12, label %min.iters.checked14
+; CHECK-LABEL: alias.result.chk:
+; CHECK-NEXT: br i1 %acr.val, label %vector.ph19, label %scalar.ph12
+; CHECK-LABEL: vector.body10:
+; CHECK: <4 x i8>
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i8* nocapture %dst, i32 %d_st, i8* nocapture readonly %src1, i32 %s1_st, i8* nocapture readonly %src2, i32 %s2_st, i32 %w, i32 %h) local_unnamed_addr #0 {
+entry:
+  %cmp32 = icmp sgt i32 %h, 0
+  br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp230 = icmp sgt i32 %w, 0
+  %idx.ext = sext i32 %d_st to i64
+  %idx.ext12 = sext i32 %s1_st to i64
+  %idx.ext14 = sext i32 %s2_st to i64
+  %wide.trip.count = zext i32 %w to i64
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
+  %y.036 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc17, %for.cond.cleanup3 ]
+  %dst.addr.035 = phi i8* [ %dst, %for.cond1.preheader.lr.ph ], [ %add.ptr, %for.cond.cleanup3 ]
+  %src1.addr.034 = phi i8* [ %src1, %for.cond1.preheader.lr.ph ], [ %add.ptr13, %for.cond.cleanup3 ]
+  %src2.addr.033 = phi i8* [ %src2, %for.cond1.preheader.lr.ph ], [ %add.ptr15, %for.cond.cleanup3 ]
+  br i1 %cmp230, label %for.body4.preheader, label %for.cond.cleanup3
+
+for.body4.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body4
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup3
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.cond.cleanup3.loopexit:                       ; preds = %for.body4
+  br label %for.cond.cleanup3
+
+for.cond.cleanup3:                                ; preds = %for.cond.cleanup3.loopexit, %for.cond1.preheader
+  %add.ptr = getelementptr inbounds i8, i8* %dst.addr.035, i64 %idx.ext
+  %add.ptr13 = getelementptr inbounds i8, i8* %src1.addr.034, i64 %idx.ext12
+  %add.ptr15 = getelementptr inbounds i8, i8* %src2.addr.033, i64 %idx.ext14
+  %inc17 = add nuw nsw i32 %y.036, 1
+  %exitcond37 = icmp eq i32 %inc17, %h
+  br i1 %exitcond37, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.body4:                                        ; preds = %for.body4.preheader, %for.body4
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ]
+  %arrayidx = getelementptr inbounds i8, i8* %src1.addr.034, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx6 = getelementptr inbounds i8, i8* %src2.addr.033, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx6, align 1
+  %conv7 = zext i8 %1 to i32
+  %add = add nuw nsw i32 %conv, 1
+  %add8 = add nuw nsw i32 %add, %conv7
+  %shr29 = lshr i32 %add8, 1
+  %conv9 = trunc i32 %shr29 to i8
+  %arrayidx11 = getelementptr inbounds i8, i8* %dst.addr.035, i64 %indvars.iv
+  store i8 %conv9, i8* %arrayidx11, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup3.loopexit, label %for.body4
+}
+