diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -168,9 +168,9 @@
 /// Information about vectorization costs
 struct VectorizationFactor {
   // Vector width with best cost
-  unsigned Width;
+  unsigned Width = 0;
   // Cost of the loop with that width
-  unsigned Cost;
+  unsigned Cost = 0;

   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() { return {1, 0}; }
@@ -243,8 +243,9 @@
   void setBestPlan(unsigned VF, unsigned UF);

   /// Generate the IR code for the body of the vectorized loop according to the
-  /// best selected VPlan.
-  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+  /// best selected VPlan. Returns 'true' if we successfully generated the
+  /// vector loop, 'false' otherwise.
+  bool executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);

   void printPlans(raw_ostream &O) {
     for (const auto &Plan : VPlans)
@@ -285,6 +286,11 @@
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
   void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+
+  /// Returns 'false' if additional overhead from generated runtime checks
+  /// (trip count, memory dependency and SCEV checks) makes vectorization not
+  /// profitable, 'true' otherwise.
+  bool mayDisregardRTChecksOverhead(InnerLoopVectorizer &ILV);
 };

 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -294,6 +294,13 @@
     "vectorize-loops", cl::init(true), cl::Hidden,
     cl::desc("Run the Loop vectorization passes"));

+static cl::opt<bool> EnableTinyLoopVectorization(
+    "vectorize-tiny-loops-with-epilog", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of tiny loops even if run-time check "
+             "and/or scalar iteration overheads are incurred. See "
+             "'vectorizer-min-trip-count' for more information on tiny "
+             "loops."));
+
 /// A helper function for converting Scalar types to vector types.
 /// If the incoming type is void, we return void. If the VF is 1, we return
 /// the scalar type.
@@ -362,7 +369,7 @@
 /// 2) Returns expected trip count according to profile data if any.
 /// 3) Returns upper bound estimate if it is known.
 /// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
   // Check if exact trip count is known.
   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
     return ExpectedTC;
@@ -935,12 +942,6 @@
   // Vectorization with OptForSize: don't allow epilogues.
   CM_ScalarEpilogueNotAllowedOptSize,

-  // A special case of vectorisation with OptForSize: loops with a very small
-  // trip count are considered for vectorization under OptForSize, thereby
-  // making sure the cost of their loop body is dominant, free of runtime
-  // guards and scalar iteration overheads.
-  CM_ScalarEpilogueNotAllowedLowTripLoop,
-
   // Loop hint predicate indicating an epilogue is undesired.
   CM_ScalarEpilogueNotNeededUsePredicate
 };
@@ -973,7 +974,7 @@
   /// \return True if runtime checks are required for vectorization, and false
   /// otherwise.
-  bool runtimeChecksRequired();
+  bool runtimeChecksRequired(bool ReportFailure);

   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -1304,13 +1305,6 @@
   /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

-private:
-  unsigned NumPredStores = 0;
-
-  /// \return An upper bound for the vectorization factor, larger than zero.
-  /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
-
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -1320,16 +1314,23 @@
   /// actually taken place).
   using VectorizationCostTy = std::pair<unsigned, bool>;

+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+private:
+  unsigned NumPredStores = 0;
+
+  /// \return An upper bound for the vectorization factor, larger than zero.
+  /// One is returned if vectorization should best be avoided due to cost.
+  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(unsigned VF);

-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
@@ -1511,6 +1512,11 @@

   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Cached {VF, Cost} for the scalar loop (VF == 1).
+  VectorizationFactor ScalarVF;
+  /// Cached {VF, Cost} for the best expected vectorization mode.
+  VectorizationFactor BestVF;
 };

 } // end namespace llvm
@@ -4890,34 +4896,40 @@
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
 }

-bool LoopVectorizationCostModel::runtimeChecksRequired() {
+bool LoopVectorizationCostModel::runtimeChecksRequired(bool ReportFailure) {
   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

   if (Legal->getRuntimePointerChecking()->Need) {
-    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
-        "runtime pointer checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime ptr check is required with -Os/-Oz",
+          "runtime pointer checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
     return true;
   }

   if (!PSE.getUnionPredicate().getPredicates().empty()) {
-    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
-        "runtime SCEV checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime SCEV check is required with -Os/-Oz",
+          "runtime SCEV checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
     return true;
   }

   // FIXME: Avoid specializing for stride==1 instead of bailing out.
   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
-        "runtime stride == 1 checks needed. Enable vectorization of "
-        "this loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime stride check is required with -Os/-Oz",
+          "runtime stride == 1 checks needed. Enable vectorization of "
+          "this loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
     return true;
   }

@@ -4945,27 +4957,53 @@
   }

   switch (ScalarEpilogueStatus) {
-  case CM_ScalarEpilogueAllowed:
+  case CM_ScalarEpilogueAllowed: {
+    auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+    // Tiny loops are handled in a special way.
+    if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+      if (EnableTinyLoopVectorization) {
+        // For tiny loops without runtime checks prefer masked vectorization
+        // to preserve legacy behavior.
+        // TODO: Ideally this decision should be done by cost model.
+        if (Hints->getForce() != LoopVectorizeHints::FK_Enabled &&
+            !runtimeChecksRequired(false)) {
+          LLVM_DEBUG(
+              dbgs() << "LV: Prefer masked vectorization for short trip "
+                     << "count loop.\n");
+          break;
+        }
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "LV: Found a loop with a very small trip count. "
+                   << "This loop is worth vectorizing only if no scalar "
+                   << "iteration overheads are incurred.");
+        if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+          LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+        else {
+          LLVM_DEBUG(dbgs() << "\n");
+          ScalarEpilogueStatus = CM_ScalarEpilogueNotAllowedOptSize;
+          // Legacy behavior is to disable vectorization for tiny loops.
+          if (runtimeChecksRequired(true))
+            return None;
+          break;
+        }
+      }
+    }
     return computeFeasibleMaxVF(TC);
+  }
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
-  case CM_ScalarEpilogueNotAllowedLowTripLoop:
-    // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
     LLVM_DEBUG(
         dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
-    else
-      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
-                        << "count.\n");
     // Bail if runtime checks are required, which are not good when optimising
     // for size.
-    if (runtimeChecksRequired())
+    if (runtimeChecksRequired(true))
       return None;
     break;
   }
@@ -4989,10 +5027,22 @@
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
+    // Synchronize 'ScalarEpilogueStatus' with folding mode if required.
+    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed)
+      ScalarEpilogueStatus = CM_ScalarEpilogueNotAllowedOptSize;
     FoldTailByMasking = true;
     return MaxVF;
   }

+  // If a scalar epilogue was not forbidden, proceed with 'normal'
+  // vectorization.
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Masked vectorization is not allowed. Continue with "
+                  "'normal' vectorization using epilogue.\n");
+
+    return MaxVF;
+  }
+
   if (TC == 0) {
     reportVectorizationFailure(
         "Unable to calculate the loop count due to complex control flow",
@@ -5088,10 +5138,11 @@
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
-  float Cost = expectedCost(1).first;
-  const float ScalarCost = Cost;
+  ScalarVF = { 1, expectedCost(1).first };
+  float Cost = ScalarVF.Cost;
   unsigned Width = 1;
-  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarVF.Cost
+                    << ".\n");

   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization && MaxVF > 1) {
@@ -5126,15 +5177,16 @@
         "store that is conditionally executed prevents vectorization",
         "ConditionalStore", ORE, TheLoop);
     Width = 1;
-    Cost = ScalarCost;
+    Cost = ScalarVF.Cost;
   }

-  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
-             << "LV: Vectorization seems to be not beneficial, "
-             << "but was forced by a user.\n");
+  LLVM_DEBUG(
+      if (ForceVectorization && Width > 1 && Cost >= ScalarVF.Cost) dbgs()
+          << "LV: Vectorization seems to be not beneficial, "
+          << "but was forced by a user.\n");

   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
-  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
-  return Factor;
+  BestVF = { Width, (unsigned)(Width * Cost) };
+  return BestVF;
 }

 std::pair<unsigned, unsigned>
@@ -6561,7 +6613,120 @@
   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
 }

-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+// Helper function to calculate the cost of all instructions in \p C.
+template <typename Container>
+static uint64_t getCostOfBlocks(LoopVectorizationCostModel &CM,
+                                Container &&C) {
+  uint64_t TotalCost = 0;
+  for (BasicBlock *BB : C) {
+    for (Instruction &I : *BB) {
+      auto InstCost = CM.getInstructionCost(&I, 1).first;
+      TotalCost += InstCost;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << InstCost
+                        << " for VF " << 1 << " For instruction: " << I
+                        << '\n');
+    }
+  }
+  return TotalCost;
+}
+
+// Returns 'false' if additional overhead from generated runtime checks (trip
+// count, memory dependency and SCEV checks) makes vectorization not
+// profitable, 'true' otherwise.
+bool LoopVectorizationPlanner::mayDisregardRTChecksOverhead(
+    InnerLoopVectorizer &ILV) {
+  Optional<unsigned> ExpectedTC =
+      getSmallBestKnownTC(*CM.PSE.getSE(), OrigLoop);
+
+  // No need to check for RT overhead for loops that are not expected to have
+  // a short trip count.
+  // TODO: This is done this way to preserve legacy behavior. Eventually we
+  // should check for RT overhead for all loops regardless of TC.
+  if (!ExpectedTC || *ExpectedTC >= TinyTripCountVectorThreshold) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Disregarding run-time checks overhead: not a short "
+                  "trip count loop.\n");
+    return true;
+  }
+
+  // No need to check for RT overhead if vectorization was forced. Note that
+  // cost modeling may still be performed to select the best VF.
+  if (CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) {
+    LLVM_DEBUG(dbgs() << "LV: Disregarding run-time checks overhead: "
+                         "vectorization was forced.\n");
+    return true;
+  }
+
+  // No need to check for RT overhead if cost modeling was skipped and the VF
+  // was selected by the user.
+  if (CM.ScalarVF.Width == 0) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Disregarding run-time checks overhead: VF was forced.\n");
+    return true;
+  }
+
+  assert(CM.BestVF.Width > 1 && "Best VF was not properly selected?");
+
+  LLVM_DEBUG(dbgs() << "LV: Checking cost of run-time overhead for short "
+                       "trip count loop.\n");
+
+  uint64_t VecTripCount = *ExpectedTC / CM.BestVF.Width;
+  uint64_t RemainderTripCount = *ExpectedTC % CM.BestVF.Width;
+
+  // In "foldTailByMasking" mode remainder iterations are executed as part of
+  // the main vector loop. That means all remainder iterations will be executed
+  // as one masked vector iteration.
+  if (RemainderTripCount != 0 && CM.foldTailByMasking()) {
+    ++VecTripCount;
+    RemainderTripCount = 0;
+  }
+
+  // In "requiresScalarEpilogue" mode at least one iteration must be executed
+  // in the remainder loop. If all iterations would end up in the main vector
+  // loop, move one vector iteration to the remainder loop.
+  if (RemainderTripCount == 0 && CM.requiresScalarEpilogue()) {
+    --VecTripCount;
+    RemainderTripCount = CM.BestVF.Width;
+  }
+
+  uint64_t VecRTCost = getCostOfBlocks(CM, ILV.LoopBypassBlocks) +
+                       getCostOfBlocks<SmallVector<BasicBlock *, 4> >(
+                           CM, { ILV.LoopVectorPreHeader, ILV.LoopMiddleBlock,
+                                 ILV.LoopScalarPreHeader });
+  uint64_t VecCost = CM.BestVF.Cost * VecTripCount;
+  uint64_t RemainderCost = CM.ScalarVF.Cost * RemainderTripCount;
+  uint64_t VecTotalCost = VecRTCost + VecCost + RemainderCost;
+  uint64_t ScalarTotalCost = CM.ScalarVF.Cost * (*ExpectedTC);
+
+  LLVM_DEBUG(dbgs() << "LV: ScalarTotalCost = " << ScalarTotalCost << "\n");
+  LLVM_DEBUG(dbgs() << "LV: VecTotalCost = RTCost + (VecCost * VecTC) + "
+                       "(RemainderCost * RemainderTC) = " << VecRTCost << " + ("
+                    << CM.BestVF.Cost << " * " << VecTripCount << ") + ("
+                    << CM.ScalarVF.Cost << " * " << RemainderTripCount
+                    << ") = " << VecTotalCost << "\n");
+
+  if (VecTotalCost >= ScalarTotalCost) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: It's not profitable to vectorize short trip count loop.\n");
+
+    assert(isa<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator()) &&
+           "RT check should end with branch instruction.");
+
+    return false;
+  }
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LV: It's still profitable to vectorize short trip count loop.\n");
+  return true;
+}
+
+bool LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                            DominatorTree *DT) {
   // Perform the actual loop transformation.
@@ -6582,13 +6747,35 @@
   //
   //===------------------------------------------------===//

-  // 2. Copy and widen instructions from the old loop into the new loop.
-  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-  VPlans.front()->execute(&State);
+  bool IsVectorizationProfitable = mayDisregardRTChecksOverhead(ILV);
+  // Skip generation of the vector body if vectorization turned out not to be
+  // profitable (the vector loop is dead in this case).
+  if (IsVectorizationProfitable) {
+    // 2. Copy and widen instructions from the old loop into the new loop.
+    assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+    VPlans.front()->execute(&State);
+
+    // 3. Fix the vectorized code: take care of header phi's, live-outs,
+    // predication, updating analyses.
+    ILV.fixVectorizedLoop();
+  } else {
+    // Make the vectorized loop effectively dead. Later optimizations should
+    // clean it up.
+    auto *BrInst =
+        cast<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator());
+    BrInst->setCondition(
+        ConstantInt::getTrue(BrInst->getCondition()->getType()));
+
+    ILV.ORE->emit([&]() {
+      return OptimizationRemark(LV_NAME, "Not Vectorized",
+                                ILV.OrigLoop->getStartLoc(),
+                                ILV.OrigLoop->getHeader())
+             << "not profitable to vectorize short trip count loop.";
+    });
+  }

-  // 3. Fix the vectorized code: take care of header phi's, live-outs,
-  // predication, updating analyses.
-  ILV.fixVectorizedLoop();
+  return IsVectorizationProfitable;
 }

 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
@@ -7584,7 +7771,9 @@
                                         &CM);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
-  LVP.executePlan(LB, DT);
+  bool IsVectorized = LVP.executePlan(LB, DT);
+  assert(IsVectorized && "VPlan failed to be executed in native path.");
+  (void)IsVectorized;

   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
@@ -7663,21 +7852,6 @@

   assert(L->empty() && "Inner loop expected.");

-  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
-  // count by optimizing for size, to minimize overheads.
-  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
-  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
-    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
-                      << "This loop is worth vectorizing only if no scalar "
-                      << "iteration overheads are incurred.");
-    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
-      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
-    else {
-      LLVM_DEBUG(dbgs() << "\n");
-      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
-    }
-  }
-
   // Check the function attributes to see if implicit floats are allowed.
   // FIXME: This check doesn't seem possibly correct -- what if the loop is
   // an integer loop and the vector instructions selected are purely integer
@@ -7833,59 +8007,61 @@
   LVP.setBestPlan(VF.Width, IC);

   using namespace ore;
-  bool DisableRuntimeUnroll = false;
   MDNode *OrigLoopID = L->getLoopID();
+  std::unique_ptr<InnerLoopVectorizer> ILV;
   if (!VectorizeLoop) {
     assert(IC > 1 && "interleave count should not be 1 or 0");
     // If we decided that it is not legal to vectorize the loop, then
     // interleave it.
-    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
-                               &CM);
-    LVP.executePlan(Unroller, DT);
-
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
-                                L->getHeader())
-             << "interleaved loop (interleaved count: "
-             << NV("InterleaveCount", IC) << ")";
-    });
+    ILV = std::make_unique<InnerLoopUnroller>(L, PSE, LI, DT, TLI, TTI, AC,
+                                              ORE, IC, &LVL, &CM);
   } else {
     // If we decided that it is *legal* to vectorize the loop, then do it.
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                           &LVL, &CM);
-    LVP.executePlan(LB, DT);
-    ++LoopsVectorized;
-
-    // Add metadata to disable runtime unrolling a scalar loop when there are
-    // no runtime checks about strides and memory. A scalar loop that is
-    // rarely used is not worth unrolling.
-    if (!LB.areSafetyChecksAdded())
-      DisableRuntimeUnroll = true;
-
-    // Report the vectorization decision.
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
-                                L->getHeader())
-             << "vectorized loop (vectorization width: "
-             << NV("VectorizationFactor", VF.Width)
-             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
-    });
+    ILV = std::make_unique<InnerLoopVectorizer>(L, PSE, LI, DT, TLI, TTI, AC,
+                                                ORE, VF.Width, IC, &LVL, &CM);
   }

-  Optional<MDNode *> RemainderLoopID =
-      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
-                                      LLVMLoopVectorizeFollowupEpilogue});
-  if (RemainderLoopID.hasValue()) {
-    L->setLoopID(RemainderLoopID.getValue());
-  } else {
-    if (DisableRuntimeUnroll)
-      AddRuntimeUnrollDisableMetaData(L);
+  bool IsPlanExecuted = LVP.executePlan(*ILV, DT);

-    // Mark the loop as already vectorized to avoid vectorizing again.
-    Hints.setAlreadyVectorized();
+  if (IsPlanExecuted) {
+    if (!VectorizeLoop) {
+      ORE->emit([&]() {
+        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+                                  L->getHeader())
+               << "interleaved loop (interleaved count: "
+               << NV("InterleaveCount", IC) << ")";
+      });
+    } else {
+      ++LoopsVectorized;
+      // Report the vectorization decision.
+      ORE->emit([&]() {
+        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+                                  L->getHeader())
+               << "vectorized loop (vectorization width: "
+               << NV("VectorizationFactor", VF.Width)
+               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+      });
+    }
+
+    Optional<MDNode *> RemainderLoopID =
+        makeFollowupLoopID(OrigLoopID, { LLVMLoopVectorizeFollowupAll,
+                                         LLVMLoopVectorizeFollowupEpilogue });
+    if (RemainderLoopID.hasValue()) {
+      L->setLoopID(RemainderLoopID.getValue());
+    } else {
+      // Add metadata to disable runtime unrolling a scalar loop when there are
+      // no runtime checks about strides and memory. A scalar loop that is
+      // rarely used is not worth unrolling.
+      if (VectorizeLoop && !ILV->areSafetyChecksAdded())
+        AddRuntimeUnrollDisableMetaData(L);
+
+      // Mark the loop as already vectorized to avoid vectorizing again.
+      Hints.setAlreadyVectorized();
+    }
   }

+  // The IR may have been changed even if 'IsPlanExecuted' is false.
   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
   return true;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="print,loop-vectorize" -S -debug-only=loop-vectorize -vectorize-tiny-loops-with-epilog=true < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Check vectorization of a hot short trip count loop with an epilogue. In this
+; case the inner loop trip count is not a compile-time constant and its value
+; is estimated from profile data.
+
+; ModuleID = 'test.cpp'
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = dso_local global [5 x i32] zeroinitializer, align 16
+@b = dso_local global [5 x i32] zeroinitializer, align 16
+
+; CHECK: LV: Found trip count: 0
+; CHECK: LV: Checking cost of run-time overhead for short trip count loop.
+; CHECK: LV: It's still profitable to vectorize short trip count loop.
+;
+; CHECK: LV: Found trip count: 5
+; CHECK: LV: Checking cost of run-time overhead for short trip count loop.
+; CHECK: LV: It's still profitable to vectorize short trip count loop.
+; +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: [[B:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 0 +; CHECK-NEXT: br label [[FOR_BODY_US_PREHEADER:%.*]] +; CHECK: for.body.us.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M:%.*]] to i64 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; CHECK-NEXT: br label [[FOR_BODY_US:%.*]] +; CHECK: for.body.us: +; CHECK-NEXT: [[J_019_US:%.*]] = phi i32 [ [[INC8_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: call void @_Z3barPi(i32* nonnull [[ARRAYDECAY]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP0]], [[SCEVGEP23]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !tbaa !3, !alias.scope !7 +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND4]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0 
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD6]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] +; CHECK: for.body4.us: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[MUL_US:%.*]] = mul nsw i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD_US:%.*]] = add nsw i32 [[TMP20]], [[MUL_US]] +; CHECK-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !prof !14, !llvm.loop !15 +; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: +; CHECK-NEXT: [[INC8_US]] = add nuw nsw i32 [[J_019_US]], 1 +; CHECK-NEXT: [[EXITCOND21:%.*]] = icmp eq i32 [[INC8_US]], 20 +; CHECK-NEXT: br i1 [[EXITCOND21]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]], !prof !16 +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup.loopexit24: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, 
%for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[J_018:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC8:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 and (i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i64 1, i64 0)), i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 1, i64 0))), true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa !3, !alias.scope !17 +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 5, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[J_018]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC8]], 1000 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !prof 
!23 +; CHECK: for.body4: +; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_017]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !3 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], [[I_017]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[MUL]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_017]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 5 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP3]], !llvm.loop !24 +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 1, i32 999} +!13 = !{!"branch_weights", i32 1000, i32 1} +
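
The profitability check in mayDisregardRTChecksOverhead boils down to comparing the cost of the scalar loop against the cost of the vector loop plus the run-time check overhead and the scalar remainder. The snippet below is a minimal standalone sketch of that comparison with LLVM's cost-model types replaced by plain integers; the helper name profitableWithRTChecks and the numbers in main() are hypothetical and for illustration only.

#include <cstdint>
#include <cstdio>

struct VFCost {
  unsigned Width; // vectorization factor (1 == scalar)
  uint64_t Cost;  // cost of one loop iteration at this VF
};

// True if vectorizing still pays off once the run-time check blocks
// (RTCheckCost) and the scalar remainder are accounted for, for a loop
// expected to run ExpectedTC iterations.
static bool profitableWithRTChecks(uint64_t ExpectedTC, VFCost Scalar,
                                   VFCost Best, uint64_t RTCheckCost,
                                   bool FoldTailByMasking,
                                   bool RequiresScalarEpilogue) {
  uint64_t VecTC = ExpectedTC / Best.Width;
  uint64_t RemTC = ExpectedTC % Best.Width;

  // Tail folding runs the remainder as one extra masked vector iteration.
  if (RemTC != 0 && FoldTailByMasking) {
    ++VecTC;
    RemTC = 0;
  }
  // A required scalar epilogue must execute at least one scalar iteration,
  // so give one vector iteration back to the remainder loop.
  if (RemTC == 0 && RequiresScalarEpilogue && VecTC != 0) {
    --VecTC;
    RemTC = Best.Width;
  }

  uint64_t VecTotal = RTCheckCost + Best.Cost * VecTC + Scalar.Cost * RemTC;
  uint64_t ScalarTotal = Scalar.Cost * ExpectedTC;
  return VecTotal < ScalarTotal;
}

int main() {
  // Hypothetical numbers: TC = 5, scalar body cost 8, VF=4 body cost 10,
  // run-time checks cost 6, scalar epilogue required, no tail folding.
  bool Profitable = profitableWithRTChecks(/*ExpectedTC=*/5, /*Scalar=*/{1, 8},
                                           /*Best=*/{4, 10},
                                           /*RTCheckCost=*/6,
                                           /*FoldTailByMasking=*/false,
                                           /*RequiresScalarEpilogue=*/true);
  std::printf("vectorization profitable: %s\n", Profitable ? "yes" : "no");
  return 0;
}

With these made-up costs the vector path totals 6 + 10*1 + 8*1 = 24 against a scalar total of 8*5 = 40, so vectorization is still reported as profitable even though the trip count is tiny.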