Index: include/llvm/Analysis/LoopPassManager.h
===================================================================
--- include/llvm/Analysis/LoopPassManager.h
+++ include/llvm/Analysis/LoopPassManager.h
@@ -48,6 +48,9 @@
 typedef OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>
     FunctionAnalysisManagerLoopProxy;
 
+typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Loop>
+    ModuleAnalysisManagerLoopProxy;
+
 /// Returns the minimum set of Analyses that all loop passes must preserve.
 PreservedAnalyses getLoopPassPreservedAnalyses();
 
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -774,6 +774,7 @@
   FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
   FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); });
   LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
+  LAM.registerPass([&] { return ModuleAnalysisManagerLoopProxy(MAM); });
 }
 
 bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -22,6 +23,7 @@
 #include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -102,6 +104,12 @@
     cl::desc("Unrolled size limit for loops with an unroll(full) or "
              "unroll_count pragma."));
 
+static cl::opt<unsigned> FlatLoopTripCountThreshold(
+    "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
+    cl::desc("If the runtime tripcount for the loop is lower than the "
+             "threshold, the loop is considered as flat and will be less "
+             "aggressively unrolled."));
+
 /// A magic value for use with the Threshold parameter to indicate
 /// that the loop unroll should be performed regardless of how much
 /// code expansion would result.
@@ -712,8 +720,9 @@
 // Calculates unroll count and writes it to UP.Count.
 static bool computeUnrollCount(
     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
-    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
-    unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+    ScalarEvolution *SE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+    OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+    unsigned &TripMultiple, unsigned LoopSize,
     TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
@@ -748,6 +757,19 @@
   bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
                         PragmaEnableUnroll || UserUnrollCount;
 
+  if (L->getHeader()->getParent()->getEntryCount()) {
+    if (TripCount == 0 &&
+        BFI->getBlockFreq(L->getHeader()) <
+            FlatLoopTripCountThreshold *
+                BFI->getBlockFreq(L->getLoopPreheader()).getFrequency()) {
+      UP.PartialThreshold /= 4;
+    } else if (PSI->isHotBB(L->getHeader(), BFI)) {
+      UP.Threshold *= 4;
+      UP.PartialThreshold *= 4;
+      UP.AllowExpensiveTripCount = true;
+    }
+  }
+
   if (ExplicitUnroll && TripCount != 0) {
     // If the loop has an unrolling pragma, we want to be more aggressive with
     // unrolling limits. Set thresholds to at least the PragmaThreshold value
@@ -923,8 +945,9 @@
 
 static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                             ScalarEvolution *SE, const TargetTransformInfo &TTI,
-                            AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
-                            bool PreserveLCSSA,
+                            AssumptionCache &AC, BlockFrequencyInfo *BFI,
+                            ProfileSummaryInfo *PSI,
+                            OptimizationRemarkEmitter &ORE, bool PreserveLCSSA,
                             Optional<unsigned> ProvidedCount,
                             Optional<unsigned> ProvidedThreshold,
                             Optional<bool> ProvidedAllowPartial,
@@ -1016,9 +1039,9 @@
   // computeUnrollCount() decides whether it is beneficial to use upper bound to
   // fully unroll the loop.
   bool UseUpperBound = false;
-  bool IsCountSetExplicitly =
-      computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
-                         TripMultiple, LoopSize, UP, UseUpperBound);
+  bool IsCountSetExplicitly = computeUnrollCount(
+      L, TTI, DT, LI, SE, BFI, PSI, &ORE, TripCount, MaxTripCount, TripMultiple,
+      LoopSize, UP, UseUpperBound);
   if (!UP.Count)
     return false;
   // Unroll factor (Count) must be less or equal to TripCount.
@@ -1067,6 +1090,10 @@
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    BlockFrequencyInfo *BFI =
+        &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+    ProfileSummaryInfo *PSI =
+        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
     const TargetTransformInfo &TTI =
         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1076,7 +1103,7 @@
     OptimizationRemarkEmitter ORE(&F);
     bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
 
-    return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+    return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, BFI, PSI, ORE, PreserveLCSSA,
                            ProvidedCount, ProvidedThreshold,
                            ProvidedAllowPartial, ProvidedRuntime,
                            ProvidedUpperBound);
@@ -1087,6 +1114,8 @@
   ///
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
     // FIXME: Loop passes are required to preserve domtree, and for now we just
     // recreate dom info if anything gets unrolled.
@@ -1098,6 +1127,8 @@
 char LoopUnroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
@@ -1123,13 +1154,16 @@
   const auto &FAM =
       AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager();
   Function *F = L.getHeader()->getParent();
-
+  const auto &MAM = 
+      AM.getResult<ModuleAnalysisManagerLoopProxy>(L).getManager();
 
   DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
   LoopInfo *LI = FAM.getCachedResult<LoopAnalysis>(*F);
   ScalarEvolution *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
   auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F);
   auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F);
+  auto *BFI = FAM.getCachedResult<BlockFrequencyAnalysis>(*F);
+  auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F->getParent());
   auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
   if (!DT)
     report_fatal_error(
@@ -1146,14 +1180,20 @@
   if (!AC)
     report_fatal_error(
         "LoopUnrollPass: AssumptionAnalysis not cached at a higher level");
+  if (!BFI)
+    report_fatal_error(
+        "LoopUnrollPass: BlockFrequencyAnalysis not cached at a higher level");
+  if (!PSI)
+    report_fatal_error(
+        "LoopUnrollPass: ProfileSummaryAnalysis not cached at a higher level");
   if (!ORE)
     report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
                        "cached at a higher level");
 
-  bool Changed =
-      tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true,
-                      ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
-                      ProvidedRuntime, ProvidedUpperBound);
+  bool Changed = tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, BFI, PSI,
+                                 *ORE, /*PreserveLCSSA*/ true, ProvidedCount,
+                                 ProvidedThreshold, ProvidedAllowPartial,
+                                 ProvidedRuntime, ProvidedUpperBound);
 
   if (!Changed)
     return PreservedAnalyses::all();
Index: test/Other/pass-pipelines.ll
===================================================================
--- test/Other/pass-pipelines.ll
+++ test/Other/pass-pipelines.ll
@@ -46,6 +46,8 @@
 ; CHECK-O2-NOT: Manager
 ; CHECK-O2: Loop Pass Manager
 ; CHECK-O2-NOT: Manager
+; CHECK-O2: Loop Pass Manager
+; CHECK-O2-NEXT: Unroll loops
 ; FIXME: It isn't clear that we need yet another loop pass pipeline
 ; and run of LICM here.
 ; CHECK-O2-NOT: Manager
Index: test/Transforms/LoopUnroll/unloop.ll
===================================================================
--- test/Transforms/LoopUnroll/unloop.ll
+++ test/Transforms/LoopUnroll/unloop.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -S -loop-unroll -verify-loop-info | FileCheck %s
-; RUN: opt < %s -S -passes='function(require<scalar-evolution>,require<targetir>,require<opt-remark-emit>,loop(unroll),verify<loops>)' | FileCheck %s
+; RUN: opt < %s -S -passes='module(require<profile-summary>,function(require<scalar-evolution>,require<targetir>,require<opt-remark-emit>,require<block-freq>,loop(unroll),verify<loops>))' | FileCheck %s
 ;
 ; Unit tests for LoopInfo::markAsRemoved.
 
Index: test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
@@ -0,0 +1,142 @@
+; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=10 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s
+
+@known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
+
+; CHECK-LABEL: foo
+; CHECK: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+define i32 @foo(i32* noalias nocapture readonly %src) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %r  = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+  %src_element = load i32, i32* %arrayidx, align 4
+  %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+  %const_array_element = load i32, i32* %array_const_idx, align 4
+  %mul = mul nsw i32 %src_element, %const_array_element
+  %add = add nsw i32 %mul, %r
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond86.i = icmp eq i64 %inc, 9
+  br i1 %exitcond86.i, label %loop.end, label %loop
+
+loop.end:
+  %r.lcssa = phi i32 [ %r, %loop ]
+  ret i32 %r.lcssa
+}
+
+; CHECK-LABEL: foo_prof
+; CHECK-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+define i32 @foo_prof(i32* noalias nocapture readonly %src) !prof !15 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %r  = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+  %src_element = load i32, i32* %arrayidx, align 4
+  %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+  %const_array_element = load i32, i32* %array_const_idx, align 4
+  %mul = mul nsw i32 %src_element, %const_array_element
+  %add = add nsw i32 %mul, %r
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond86.i = icmp eq i64 %inc, 9
+  br i1 %exitcond86.i, label %loop.end, label %loop, !prof !16
+
+loop.end:
+  %r.lcssa = phi i32 [ %r, %loop ]
+  ret i32 %r.lcssa
+}
+
+; CHECK-LABEL: bar
+; CHECK-NOT: loop.prol
+define i32 @bar(i32* noalias nocapture readonly %src, i64 %c) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %r  = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+  %src_element = load i32, i32* %arrayidx, align 4
+  %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+  %const_array_element = load i32, i32* %array_const_idx, align 4
+  %mul = mul nsw i32 %src_element, %const_array_element
+  %add = add nsw i32 %mul, %r
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond86.i = icmp eq i64 %inc, %c
+  br i1 %exitcond86.i, label %loop.end, label %loop
+
+loop.end:
+  %r.lcssa = phi i32 [ %r, %loop ]
+  ret i32 %r.lcssa
+}
+
+; CHECK-LABEL: bar_prof
+; CHECK: loop.prol
+define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !15 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %r  = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+  %src_element = load i32, i32* %arrayidx, align 4
+  %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+  %const_array_element = load i32, i32* %array_const_idx, align 4
+  %mul = mul nsw i32 %src_element, %const_array_element
+  %add = add nsw i32 %mul, %r
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond86.i = icmp eq i64 %inc, %c
+  br i1 %exitcond86.i, label %loop.end, label %loop, !prof !16
+
+loop.end:
+  %r.lcssa = phi i32 [ %r, %loop ]
+  ret i32 %r.lcssa
+}
+
+; CHECK-LABEL: bar_prof_flat
+; CHECK-NOT: loop.prol
+define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !15 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %r  = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+  %src_element = load i32, i32* %arrayidx, align 4
+  %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
+  %const_array_element = load i32, i32* %array_const_idx, align 4
+  %mul = mul nsw i32 %src_element, %const_array_element
+  %add = add nsw i32 %mul, %r
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond86.i = icmp eq i64 %inc, %c
+  br i1 %exitcond86.i, label %loop, label %loop.end, !prof !16
+
+loop.end:
+  %r.lcssa = phi i32 [ %r, %loop ]
+  ret i32 %r.lcssa
+}
+
+!llvm.module.flags = !{!1}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"function_entry_count", i64 1}
+!16 = !{!"branch_weights", i32 1, i32 1000}