diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -168,9 +168,9 @@
 /// Information about vectorization costs
 struct VectorizationFactor {
   // Vector width with best cost
-  unsigned Width;
+  unsigned Width = 0;
   // Cost of the loop with that width
-  unsigned Cost;
+  unsigned Cost = 0;
 
   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() { return {1, 0}; }
@@ -243,8 +243,9 @@
   void setBestPlan(unsigned VF, unsigned UF);
 
   /// Generate the IR code for the body of the vectorized loop according to the
-  /// best selected VPlan.
-  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+  /// best selected VPlan. Returns 'true' if we successfully generated a
+  /// vector loop, 'false' otherwise.
+  bool executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
 
   void printPlans(raw_ostream &O) {
     for (const auto &Plan : VPlans)
@@ -285,6 +286,11 @@
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
   void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
+
+  /// Returns 'false' if the additional overhead of the generated runtime
+  /// checks (trip count, memory dependency and SCEV checks) makes
+  /// vectorization unprofitable, 'true' otherwise.
+  bool mayDisregardRTChecksOverhead(InnerLoopVectorizer &ILV);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -362,7 +362,7 @@
 /// 2) Returns expected trip count according to profile data if any.
 /// 3) Returns upper bound estimate if it is known.
 /// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
   // Check if exact trip count is known.
   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
     return ExpectedTC;
@@ -927,12 +927,6 @@
   // Vectorization with OptForSize: don't allow epilogues.
   CM_ScalarEpilogueNotAllowedOptSize,
 
-  // A special case of vectorisation with OptForSize: loops with a very small
-  // trip count are considered for vectorization under OptForSize, thereby
-  // making sure the cost of their loop body is dominant, free of runtime
-  // guards and scalar iteration overheads.
-  CM_ScalarEpilogueNotAllowedLowTripLoop,
-
   // Loop hint predicate indicating an epilogue is undesired.
   CM_ScalarEpilogueNotNeededUsePredicate
 };
@@ -965,7 +959,7 @@
 
   /// \return True if runtime checks are required for vectorization, and false
   /// otherwise.
-  bool runtimeChecksRequired();
+  bool runtimeChecksRequired(bool ReportFailure);
 
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -1294,13 +1288,6 @@
   /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
 
-private:
-  unsigned NumPredStores = 0;
-
-  /// \return An upper bound for the vectorization factor, larger than zero.
-  /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
-
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -1310,16 +1297,23 @@
   /// actually taken place).
   using VectorizationCostTy = std::pair<unsigned, bool>;
 
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+
+private:
+  unsigned NumPredStores = 0;
+
+  /// \return An upper bound for the vectorization factor, larger than zero.
+  /// One is returned if vectorization should best be avoided due to cost.
+  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
+
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(unsigned VF);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
@@ -1501,6 +1495,11 @@
 
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Cached {VF, Cost} for the scalar loop (VF == 1).
+  VectorizationFactor ScalarVF;
+  /// Cached {VF, Cost} for the best expected vectorization mode.
+  VectorizationFactor BestVF;
 };
 
 } // end namespace llvm
@@ -4843,34 +4842,40 @@
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
 }
 
-bool LoopVectorizationCostModel::runtimeChecksRequired() {
+bool LoopVectorizationCostModel::runtimeChecksRequired(bool ReportFailure) {
   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
 
   if (Legal->getRuntimePointerChecking()->Need) {
-    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
-        "runtime pointer checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime ptr check is required with -Os/-Oz",
+          "runtime pointer checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }
 
  if (!PSE.getUnionPredicate().getPredicates().empty()) {
-    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
-        "runtime SCEV checks needed. Enable vectorization of this "
-        "loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime SCEV check is required with -Os/-Oz",
+          "runtime SCEV checks needed. Enable vectorization of this "
+          "loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }
 
  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
-        "runtime stride == 1 checks needed. Enable vectorization of "
-        "this loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    if (ReportFailure)
+      reportVectorizationFailure(
+          "Runtime stride check is required with -Os/-Oz",
+          "runtime stride == 1 checks needed. Enable vectorization of "
+          "this loop with '#pragma clang loop vectorize(enable)' when "
+          "compiling with -Os/-Oz",
+          "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }
@@ -4898,27 +4903,32 @@
   }
 
   switch (ScalarEpilogueStatus) {
-  case CM_ScalarEpilogueAllowed:
+  case CM_ScalarEpilogueAllowed: {
+    // Prefer masked vectorization for short trip count loops without runtime
+    // checks; this preserves the legacy behavior.
+    // TODO: Ideally this decision should be made by the cost model.
+    auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+    if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold &&
+        !runtimeChecksRequired(false)) {
+      LLVM_DEBUG(dbgs() << "LV: Prefer masked vectorization for short trip "
+                        << "count loop.\n");
+      break;
+    }
     return computeFeasibleMaxVF(TC);
+  }
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
-  case CM_ScalarEpilogueNotAllowedLowTripLoop:
-    // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
       LLVM_DEBUG(
           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
-    else
-      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
-                        << "count.\n");
     // Bail if runtime checks are required, which are not good when optimising
     // for size.
-    if (runtimeChecksRequired())
+    if (runtimeChecksRequired(true))
       return None;
     break;
   }
 
@@ -4946,6 +4956,14 @@
     return MaxVF;
   }
 
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Masked vectorization is not allowed. Continue with "
Continue with" + "'normal' vectorization using epilogue\n"); + + return MaxVF; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -5041,10 +5059,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { - float Cost = expectedCost(1).first; - const float ScalarCost = Cost; + ScalarVF = { 1, expectedCost(1).first }; + float Cost = ScalarVF.Cost; unsigned Width = 1; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarVF.Cost + << ".\n"); bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && MaxVF > 1) { @@ -5079,15 +5098,16 @@ "store that is conditionally executed prevents vectorization", "ConditionalStore", ORE, TheLoop); Width = 1; - Cost = ScalarCost; + Cost = ScalarVF.Cost; } - LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); + LLVM_DEBUG( + if (ForceVectorization && Width > 1 && Cost >= ScalarVF.Cost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); - VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; - return Factor; + BestVF = { Width, (unsigned)(Width * Cost) }; + return BestVF; } std::pair @@ -6514,7 +6534,117 @@ assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, +// Helper function to calculate cost of all instructions in the \p C. +template +static uint64_t getCostOfBlocks(LoopVectorizationCostModel &CM, + Container &&C) { + uint64_t TotalCost = 0; + for (BasicBlock *BB : C) { + for (Instruction &I : *BB) { + auto InstCost = CM.getInstructionCost(&I, 1).first; + TotalCost += InstCost; + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << InstCost + << " for VF " << 1 << " For instruction: " << I + << '\n'); + } + } + return TotalCost; +} + +// Returns 'false' if additional overhead from generated runtime checks (trip +// count, memory dependency and SCEV checks) makes vectorization not profitable, +// 'true' otherwise. +bool LoopVectorizationPlanner::mayDisregardRTChecksOverhead( + InnerLoopVectorizer &ILV) { + Optional ExpectedTC = + getSmallBestKnownTC(*CM.PSE.getSE(), OrigLoop); + + // No need to check for RT overhead for loops expected not to have short + // trip count. + // TODO: This is done this was to preserve legacy behavior. We should change + // that eventually and be checking for RT overhead for all loops regardless of + // TC. + if (!ExpectedTC || *ExpectedTC >= TinyTripCountVectorThreshold) { + LLVM_DEBUG( + dbgs() << "LV: Disregarding run-time checks overhead: not short trip " + "count loop.\n"); + return true; + } + + // No need to check for RT overhead if vectorization was forced. Note that + // cost modeling still may be performed to select best VF. + if (CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) { + LLVM_DEBUG(dbgs() << "LV: Disregarding run-time checks overhead: " + "vectorization was forced.\n"); + return true; + } + + // No need to check for RT overhead if cost modeling was skipped and VF + // selected by the user. 
+  if (CM.ScalarVF.Width == 0) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Disregarding run-time checks overhead: VF was forced.\n");
+    return true;
+  }
+
+  assert(CM.BestVF.Width != 0 && CM.BestVF.Width > 1 &&
+         "Best VF was not properly selected?");
+
+  LLVM_DEBUG(dbgs() << "LV: Checking cost of run-time overhead for short "
+                       "trip count loop.\n");
+
+  uint64_t VecTripCount = *ExpectedTC / CM.BestVF.Width;
+  uint64_t EpilogTripCount = *ExpectedTC % CM.BestVF.Width;
+
+  uint64_t VecRTCost = getCostOfBlocks(CM, ILV.LoopBypassBlocks) +
+                       getCostOfBlocks<std::initializer_list<BasicBlock *> >(
+                           CM, { ILV.LoopVectorPreHeader, ILV.LoopMiddleBlock,
+                                 ILV.LoopScalarPreHeader });
+  uint64_t VecCost = CM.BestVF.Cost * VecTripCount;
+  uint64_t EpilogCost = CM.ScalarVF.Cost * EpilogTripCount;
+  uint64_t VecTotalCost = VecRTCost + VecCost + EpilogCost;
+  uint64_t ScalarTotalCost = CM.ScalarVF.Cost * (*ExpectedTC);
+
+  LLVM_DEBUG(dbgs() << "LV: ScalarTotalCost = " << ScalarTotalCost << "\n");
+  LLVM_DEBUG(dbgs() << "LV: VecTotalCost = RTCost + (VecCost * VecTC) + "
+                       "(EpilogCost * EpilogTC) = " << VecRTCost << " + ("
+                    << CM.BestVF.Cost << " * " << VecTripCount << ") + ("
+                    << CM.ScalarVF.Cost << " * " << EpilogTripCount
+                    << ") = " << VecTotalCost << "\n");
+
+  if (VecTotalCost >= ScalarTotalCost) {
+    assert(isa<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator()) &&
+           "RT check should end with branch instruction.");
+
+    // Make the vectorized loop effectively dead. Later optimizations should
+    // clean it up.
+    auto *BrInst =
+        cast<BranchInst>(ILV.LoopBypassBlocks.front()->getTerminator());
+    BrInst->setCondition(
+        ConstantInt::getTrue(BrInst->getCondition()->getType()));
+
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: It's not profitable to vectorize short trip count loop.\n");
+
+    ILV.ORE->emit([&]() {
+      return OptimizationRemark(LV_NAME, "Not Vectorized",
+                                ILV.OrigLoop->getStartLoc(),
+                                ILV.OrigLoop->getHeader())
+             << "not profitable to vectorize short trip count loop.";
+    });
+
+    return false;
+  }
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LV: It's still profitable to vectorize short trip count loop.\n");
+  return true;
+}
+
+bool LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                            DominatorTree *DT) {
   // Perform the actual loop transformation.
 
@@ -6535,13 +6665,20 @@
   //
   //===------------------------------------------------===//
 
-  // 2. Copy and widen instructions from the old loop into the new loop.
-  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-  VPlans.front()->execute(&State);
+  bool IsVectorizationProfitable = mayDisregardRTChecksOverhead(ILV);
+  // Skip generation of the vector body if vectorization turned out not to be
+  // profitable (the vector loop is dead in this case).
+  if (IsVectorizationProfitable) {
+    // 2. Copy and widen instructions from the old loop into the new loop.
+    assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+    VPlans.front()->execute(&State);
+
+    // 3. Fix the vectorized code: take care of header phi's, live-outs,
+    // predication, updating analyses.
+    ILV.fixVectorizedLoop();
+  }
 
-  // 3. Fix the vectorized code: take care of header phi's, live-outs,
-  // predication, updating analyses.
-  ILV.fixVectorizedLoop();
+  return IsVectorizationProfitable;
 }
 
 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
@@ -7596,21 +7733,6 @@
 
   assert(L->empty() && "Inner loop expected.");
 
-  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
-  // count by optimizing for size, to minimize overheads.
-  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
-  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
-    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
-                      << "This loop is worth vectorizing only if no scalar "
-                      << "iteration overheads are incurred.");
-    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
-      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
-    else {
-      LLVM_DEBUG(dbgs() << "\n");
-      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
-    }
-  }
-
   // Check the function attributes to see if implicit floats are allowed.
   // FIXME: This check doesn't seem possibly correct -- what if the loop is
   // an integer loop and the vector instructions selected are purely integer
@@ -7787,23 +7909,24 @@
     // If we decided that it is *legal* to vectorize the loop, then do it.
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                            &LVL, &CM);
-    LVP.executePlan(LB, DT);
-    ++LoopsVectorized;
-
-    // Add metadata to disable runtime unrolling a scalar loop when there are
-    // no runtime checks about strides and memory. A scalar loop that is
-    // rarely used is not worth unrolling.
-    if (!LB.areSafetyChecksAdded())
-      DisableRuntimeUnroll = true;
-
-    // Report the vectorization decision.
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
-                                L->getHeader())
-             << "vectorized loop (vectorization width: "
-             << NV("VectorizationFactor", VF.Width)
-             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
-    });
+    if (LVP.executePlan(LB, DT)) {
+      ++LoopsVectorized;
+
+      // Add metadata to disable runtime unrolling a scalar loop when there are
+      // no runtime checks about strides and memory. A scalar loop that is
+      // rarely used is not worth unrolling.
+      if (!LB.areSafetyChecksAdded())
+        DisableRuntimeUnroll = true;
+
+      // Report the vectorization decision.
+      ORE->emit([&]() {
+        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+                                  L->getHeader())
+               << "vectorized loop (vectorization width: "
+               << NV("VectorizationFactor", VF.Width)
+               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+      });
+    }
   }
 
   Optional<MDNode *> RemainderLoopID =
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -169,12 +169,12 @@
 ; trip count leads to opt-for-size (which otherwise could fold the tail by
 ; masking).
; CHECK-LABEL: @main -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body: +; CHECK: tc.check +; CHECK: br i1 true, label %scalar.ph, label %vector.scevcheck ; CHECK-LABEL: for.cond: ; AUTOVF-LABEL: @main -; AUTOVF-NOT: vector.scevcheck -; AUTOVF-NOT: vector.body: +; AUTOVF: tc.check +; AUTOVF: br i1 true, label %scalar.ph, label %vector.scevcheck ; AUTOVF-LABEL: for.cond: define i32 @main() local_unnamed_addr { while.cond: diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -18,43 +18,41 @@ ; define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( -; CHECK-NEXT: tc.check: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !0 +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* 
[[TMP6]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]), !llvm.access.group !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]] ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll --- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -1,14 +1,16 @@ -; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; PR39417 -; Check that the need for overflow check prevents vectorizing a loop with tiny -; trip count (which implies opt for size). -; CHECK-LABEL: @func_34 -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body: -; CHECK-LABEL: bb67: +; Check that the need for overflow check makes vectorization of a loop with tiny +; trip count not profitable. +; CHECK: LV: ScalarTotalCost = 21 +; CHECK-NEXT: LV: VecTotalCost = RTCost + (VecCost * VecTC) + (EpilogCost * EpilogTC) = 17 + (5 * 0) + (7 * 3) = 38 +; CHECK-NEXT: LV: It's not profitable to vectorize short trip count loop. 
+ +; CHECK-LABEL: @func_34( +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] define void @func_34() { bb1: br label %bb67 diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll --- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll @@ -38,11 +38,35 @@ ; instead. define i64 @test1(i64 %y) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i64> , [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> , <4 x i64> [[TMP2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]] @@ -51,9 +75,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2 ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -83,11 +107,34 @@ ; instead. 
define i64 @test2(i64 %y) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT2]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> , <4 x i64> +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: br label [[COND_END]] @@ -95,9 +142,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5 ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -126,11 +173,32 @@ ; instead. 
define i32 @test3(i64 %y) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND]], <4 x i32> +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0 +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: br label [[COND_END]] @@ -138,9 +206,9 @@ ; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !7 ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/short_tc_rt_checks.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print,loop-vectorize" -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is not constant and its value is estimated by profile. 
+ +; ModuleID = 'test.cpp' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [5 x i32] zeroinitializer, align 16 +@b = dso_local global [5 x i32] zeroinitializer, align 16 + +; CHECK: LV: Found trip count: 0 +; CHECK: LV: Checking cost of run-time overhead for short trip count loop. +; CHECK: LV: It's still profitable to vectorize short trip count loop. +; +; CHECK: LV: Found trip count: 5 +; CHECK: LV: Checking cost of run-time overhead for short trip count loop. +; CHECK: LV: It's still profitable to vectorize short trip count loop. +; +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: [[B:%.*]] = alloca [5 x i32], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [5 x i32]* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [5 x i32]* [[B]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 0 +; CHECK-NEXT: br label [[FOR_BODY_US_PREHEADER:%.*]] +; CHECK: for.body.us.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M:%.*]] to i64 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; CHECK-NEXT: br label [[FOR_BODY_US:%.*]] +; CHECK: for.body.us: +; CHECK-NEXT: [[J_019_US:%.*]] = phi i32 [ [[INC8_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: call void @_Z3barPi(i32* nonnull [[ARRAYDECAY]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP0]], [[SCEVGEP23]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; 
CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !tbaa !3, !alias.scope !7 +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND4]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD6]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa !3, !alias.scope !10, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] +; CHECK: for.body4.us: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[B]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[MUL_US:%.*]] = mul nsw i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[A]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD_US:%.*]] = add nsw i32 [[TMP20]], [[MUL_US]] +; CHECK-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]], align 4, !tbaa !3 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !prof !14, !llvm.loop !15 +; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: +; CHECK-NEXT: [[INC8_US]] = add nuw nsw i32 [[J_019_US]], 1 +; CHECK-NEXT: [[EXITCOND21:%.*]] = icmp eq i32 [[INC8_US]], 20 +; CHECK-NEXT: br i1 [[EXITCOND21]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]], !prof !16 +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup.loopexit24: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP1]]) +; CHECK-NEXT: call void 
@llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 { +; CHECK-LABEL: @_Z3fooi2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[J_018:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC8:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 and (i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i64 1, i64 0)), i1 icmp ult (i32* getelementptr inbounds ([5 x i32], [5 x i32]* @b, i32 0, i32 0), i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 1, i64 0))), true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa !3, !alias.scope !17 +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !tbaa !3, !alias.scope !20, !noalias !17 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 5, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY4:%.*]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC8]] = add nuw nsw i32 [[J_018]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC8]], 1000 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !prof 
!23 +; CHECK: for.body4: +; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_017]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !3 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], [[I_017]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[MUL]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX6]], align 4, !tbaa !3 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_017]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 5 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label [[FOR_COND_CLEANUP3]], !llvm.loop !24 +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 1, i32 999} +!13 = !{!"branch_weights", i32 1000, i32 1} +