diff --git a/llvm/include/llvm/Transforms/Utils/SizeOpts.h b/llvm/include/llvm/Transforms/Utils/SizeOpts.h --- a/llvm/include/llvm/Transforms/Utils/SizeOpts.h +++ b/llvm/include/llvm/Transforms/Utils/SizeOpts.h @@ -22,12 +22,12 @@ /// Returns true if function \p F is suggested to be size-optimized base on the /// profile. -bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI); +Optional<bool> shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); /// Returns true if basic block \p BB is suggested to be size-optimized base /// on the profile. -bool shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI); +Optional<bool> shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); } // end namespace llvm diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -552,8 +552,9 @@ unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); - if (!OptForSize || std::distance(S,E) > 100) { + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI) + .getValueOr(false); + if (!OptForSize || std::distance(S, E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -542,8 +542,9 @@ auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); - bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + bool OptForSize = + F->hasOptSize() || + 
llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI).getValueOr(false); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -210,8 +210,9 @@ TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes - bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + bool OptForSize = + L->getHeader()->getParent()->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI).getValueOr(false); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2596,8 +2596,9 @@ // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. - bool OptForSize = CI->getFunction()->hasOptSize() || - llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + bool OptForSize = + CI->getFunction()->hasOptSize() || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI).getValueOr(false); if (OptForSize) return nullptr; diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp --- a/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -20,18 +20,19 @@ "pgso", cl::Hidden, cl::init(true), cl::desc("Enable the profile guided size optimization. 
")); -bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { +Optional llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { assert(F); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI); + if (!ProfileGuidedSizeOpt || !PSI || !BFI || !PSI->hasProfileSummary()) + return None; + return PSI->isFunctionColdInCallGraph(F, *BFI); } -bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { +Optional llvm::shouldOptimizeForSize(BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { assert(BB); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI); + if (!ProfileGuidedSizeOpt || !PSI || !BFI || !PSI->hasProfileSummary()) + return None; + return PSI->isColdBlock(BB, BFI); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -293,6 +293,15 @@ "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); +static cl::opt LocalHotnessThreshold( + "local-hotness-threshold", cl::init(500), cl::Hidden, + cl::desc( + "In cases when there is no info on block hotness available from module " + "profile we define \"local hotness\" as a ratio of the block to " + "function entry execution counts. If the ration is greater than the " + "threshold defined by this parameter the block is said to be locally " + "hot.")); + /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -930,7 +939,7 @@ // Vectorization with OptForSize: don't allow epilogues. 
CM_ScalarEpilogueNotAllowedOptSize, - // A special case of vectorisation with OptForSize: loops with a very small + // A special case of vectorization with OptForSize: loops with a very small // trip count are considered for vectorization under OptForSize, thereby // making sure the cost of their loop body is dominant, free of runtime // guards and scalar iteration overheads. @@ -7365,15 +7374,48 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, + ScalarEvolution &SE) { ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + auto IsColdByProfile = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + (F->hasOptSize() || IsColdByProfile.getValueOr(false))) SEL = CM_ScalarEpilogueNotAllowedOptSize; else if (PreferPredicateOverEpilog || Hints.getPredicate()) SEL = CM_ScalarEpilogueNotNeededUsePredicate; + else { + auto ExpectedTC = getSmallBestKnownTC(SE, L); + // Check the loop for a trip count threshold: vectorize loops with a tiny + // trip count by optimizing for size, to minimize overheads. + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { + // Even short trip count loops may be hot (part of hot region). + // In absence of profile summary estimate loop hotness relative to + // function entry using execution frequency information. 
+ if (!IsColdByProfile && LoopVectorizeWithBlockFrequency && BFI) { + Optional<uint64_t> LoopCount = + BFI->getBlockProfileCount(L->getHeader(), true); + Optional<uint64_t> FunctionCount = + BFI->getBlockProfileCount(&F->getEntryBlock(), true); + if (LoopCount && FunctionCount && + (*LoopCount > *FunctionCount * LocalHotnessThreshold)) { + LLVM_DEBUG(dbgs() << "Allow epilog for short trip count loop due to " + "hotness considerations."); + return CM_ScalarEpilogueAllowed; + } + } + + LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); + if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) + LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + LLVM_DEBUG(dbgs() << "\n"); + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + } + } + } return SEL; } @@ -7391,7 +7433,8 @@ assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, *PSE.getSE()); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -7483,7 +7526,8 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, *SE); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. 
They may require CFG and instruction level transformations before @@ -7496,21 +7540,6 @@ assert(L->empty() && "Inner loop expected."); - // Check the loop for a trip count threshold: vectorize loops with a tiny trip - // count by optimizing for size, to minimize overheads. - auto ExpectedTC = getSmallBestKnownTC(*SE, L); - if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { - LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); - if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) - LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); - else { - LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; - } - } - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer diff --git a/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print,loop-vectorize" -S < %s 2>&1 | FileCheck %s + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is not constant and its value is estimated by profile. 
+ +; ModuleID = 'test.cpp' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [5 x i32] zeroinitializer, align 16 +@b = dso_local global [5 x i32] zeroinitializer, align 16 + +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, 
!tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !14 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; This is negative test. Check that vectorization is not performed for COLD +; short trip count loop requiring epilog. Note that outer loop has only 20 +; iterations and there is no associated profile info. 
+ + +; Function Attrs: uwtable +define dso_local void @_Z3fooi3(i32 %M) local_unnamed_addr #0 !prof !16 { +; CHECK: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US:%.*]] +; CHECK: [[MUL_US:%.*]] = mul nsw i32 [[TMP2]], [[TMP3:%.*]] +; CHECK: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX6_US:%.*]] +; CHECK: [[ADD_US:%.*]] = add nsw i32 [[TMP4]], [[MUL_US]] +; CHECK: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !15 + 
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = 
!{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!12 = !{!"branch_weights", i32 1, i32 999} +!11 = !{!"function_entry_count", i64 1} +!13 = !{!"branch_weights", i32 1000, i32 1} +!14 = !{!"function_entry_count", i64 1} +!15 = !{!"branch_weights", i32 9, i32 45} +!16 = !{!"function_entry_count", i64 1}