diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -294,6 +294,14 @@ "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); +static cl::opt<unsigned> LocalHotnessThreshold( + "local-hotness-threshold", cl::init(500), cl::Hidden, + cl::desc( + "In cases when there is no info on block hotness available from module " + "profile we define \"local hotness\" as a ratio of the block to " + "function entry execution counts. If the ratio is greater than the " + "threshold defined by this parameter the block is said to be locally " + "hot.")); /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -7439,6 +7447,39 @@ (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) && Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) SEL = CM_ScalarEpilogueNotNeededUsePredicate; + else { + auto ExpectedTC = getSmallBestKnownTC(*SE, L); + // Check the loop for a trip count threshold: vectorize loops with a tiny + // trip count by optimizing for size, to minimize overheads. + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { + // Even short trip count loops may be hot (part of hot region). + // In absence of profile summary estimate loop hotness relative to + // function entry using execution frequency information. 
+ if (!IsColdByProfile && LoopVectorizeWithBlockFrequency && BFI) { + Optional<uint64_t> LoopCount = + BFI->getBlockProfileCount(L->getHeader(), true); + Optional<uint64_t> FunctionCount = + BFI->getBlockProfileCount(&F->getEntryBlock(), true); + if (LoopCount && FunctionCount && + (*LoopCount > *FunctionCount * LocalHotnessThreshold)) { + LLVM_DEBUG(dbgs() << "Allow epilog for short trip count loop due to " + "hotness considerations.\n"); + return CM_ScalarEpilogueAllowed; + } + } + + LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); + + if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) + LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + LLVM_DEBUG(dbgs() << "\n"); + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + } + } + } return SEL; } @@ -7567,21 +7608,6 @@ assert(L->empty() && "Inner loop expected."); - // Check the loop for a trip count threshold: vectorize loops with a tiny trip - // count by optimizing for size, to minimize overheads. - auto ExpectedTC = getSmallBestKnownTC(*SE, L); - if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { - LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); - if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) - LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); - else { - LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; - } - } - // Check the function attributes to see if implicit floats are allowed. 
// FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer diff --git a/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hot_short_tc_loop.ll @@ -0,0 +1,205 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="print<block-freq>,loop-vectorize" -S < %s 2>&1 | FileCheck %s + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is not constant and its value is estimated by profile. + +; ModuleID = 'test.cpp' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [5 x i32] zeroinitializer, align 16 +@b = dso_local global [5 x i32] zeroinitializer, align 16 + +; Function Attrs: uwtable +define dso_local void @_Z3fooi(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = 
%for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !10 + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us, !prof !12 + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Check vectorization of hot short trip count with epilog. In this case inner +; loop trip count is known constant value. 
+ +; Function Attrs: uwtable +define dso_local void @_Z3fooi2() local_unnamed_addr #0 !prof !11 { +; CHECK: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15:%.*]] +; CHECK: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND6:%.*]] +; CHECK: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23:%.*]] +; CHECK: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP18]] +; CHECK: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP28:%.*]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.body: ; preds = %entry, %for.cond.cleanup3 + %j.018 = phi i32 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + tail call void @_Z3barPi(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @a, i64 0, i64 0)) + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4 + %inc8 = add nuw nsw i32 %j.018, 1 + %cmp = icmp ult i32 %inc8, 1000 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !prof !13 + +for.body4: ; preds = %for.body, %for.body4 + %i.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ] + %idxprom = zext i32 %i.017 to i64 + %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @b, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul = mul nsw i32 %0, %i.017 + %arrayidx6 = getelementptr inbounds [5 x i32], [5 x i32]* @a, i64 0, i64 %idxprom + %1 = load i32, i32* %arrayidx6, align 4, !tbaa !2 + %add = add nsw i32 %1, %mul + store i32 %add, i32* %arrayidx6, align 4, !tbaa !2 + %inc = add nuw nsw i32 %i.017, 1 + %cmp2 = icmp ult i32 %inc, 5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 +} + +; This is negative test. Check that vectorization is not performed for COLD +; short trip count loop requiring epilog. Note that outer loop has only 20 +; iterations and there is no associated profile info. 
+ + +; Function Attrs: uwtable +define dso_local void @_Z3fooi3(i32 %M) local_unnamed_addr #0 !prof !11 { +; CHECK: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US:%.*]] +; CHECK: [[MUL_US:%.*]] = mul nsw i32 [[TMP2]], [[TMP3:%.*]] +; CHECK: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX6_US:%.*]] +; CHECK: [[ADD_US:%.*]] = add nsw i32 [[TMP4]], [[MUL_US]] +; CHECK: store i32 [[ADD_US]], i32* [[ARRAYIDX6_US]] +; +entry: + %a = alloca [5 x i32], align 16 + %b = alloca [5 x i32], align 16 + %0 = bitcast [5 x i32]* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %0) #3 + %1 = bitcast [5 x i32]* %b to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %1) #3 + %arraydecay = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 0 + br label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + %wide.trip.count = zext i32 %M to i64 + br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.body.us.preheader + %j.019.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ] + call void @_Z3barPi(i32* nonnull %arraydecay) + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.body.us + %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ] + %arrayidx.us = getelementptr inbounds [5 x i32], [5 x i32]* %b, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.us, align 4, !tbaa !2 + %3 = trunc i64 %indvars.iv to i32 + %mul.us = mul nsw i32 %2, %3 + %arrayidx6.us = getelementptr inbounds [5 x i32], [5 x i32]* %a, i64 0, i64 %indvars.iv + %4 = load i32, i32* %arrayidx6.us, align 4, !tbaa !2 + %add.us = add nsw i32 %4, %mul.us + store i32 %add.us, i32* %arrayidx6.us, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !prof !14 + 
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %inc8.us = add nuw nsw i32 %j.019.us, 1 + %exitcond21 = icmp eq i32 %inc8.us, 20 + br i1 %exitcond21, label %for.cond.cleanup.loopexit, label %for.body.us + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit24: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit24, %for.cond.cleanup.loopexit + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %0) #3 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +declare dso_local void @_Z3barPi(i32*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project f379dd57b978c4e1483d721f422c79e3c0c5ccdc)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.isvectorized", i32 1} +!8 = distinct !{!8, !9, !7} +!9 = !{!"llvm.loop.unroll.runtime.disable"} +!10 = !{!"branch_weights", i32 999, i32 4995} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 1, i32 999} +!13 = !{!"branch_weights", i32 1000, i32 1} +!14 = !{!"branch_weights", i32 9, i32 45}