diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1394,8 +1394,10 @@ return Err; // Add the nested pass manager with the appropriate adaptor. bool UseMemorySSA = (Name == "loop-mssa"); - bool UseBFI = llvm::any_of( - InnerPipeline, [](auto Pipeline) { return Pipeline.Name == "licm"; }); + bool UseBFI = llvm::any_of(InnerPipeline, [](auto Pipeline) { + return Pipeline.Name.contains("licm") || + Pipeline.Name.contains("simple-loop-unswitch"); + }); bool UseBPI = llvm::any_of(InnerPipeline, [](auto Pipeline) { return Pipeline.Name == "loop-predication"; }); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GuardUtils.h" @@ -26,6 +27,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -3044,6 +3046,7 @@ bool NonTrivial, function_ref)> UnswitchCB, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, function_ref DestroyLoopCB) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); @@ -3080,6 +3083,14 @@ if (L.getHeader()->getParent()->hasOptSize()) return false; + // Skip cold loops, as unswitching them brings little benefit + // but increases the code size + if (PSI && PSI->hasProfileSummary() && BFI && + PSI->isColdBlock(L.getHeader(), BFI)) { + LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); + return false; + } + // Skip non-trivial unswitching for loops that cannot be cloned. if (!L.isSafeToClone()) return false; @@ -3105,7 +3116,11 @@ LPMUpdater &U) { Function &F = *L.getHeader()->getParent(); (void)F; - + ProfileSummaryInfo *PSI = nullptr; + if (auto OuterProxy = + AM.getResult(L, AR) + .getCachedResult(F)) + PSI = OuterProxy->getCachedResult(*F.getParent()); LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); @@ -3152,7 +3167,7 @@ } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, UnswitchCB, &AR.SE, MSSAU ? MSSAU.getPointer() : nullptr, - DestroyLoopCB)) + PSI, AR.BFI, DestroyLoopCB)) return PreservedAnalyses::all(); if (AR.MSSA && VerifyMemorySSA) @@ -3214,7 +3229,6 @@ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n"); - auto &DT = getAnalysis().getDomTree(); auto &LI = getAnalysis().getLoopInfo(); auto &AC = getAnalysis().getAssumptionCache(F); @@ -3251,9 +3265,9 @@ if (VerifyMemorySSA) MSSA->verifyMemorySSA(); - - bool Changed = unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, - UnswitchCB, SE, &MSSAU, DestroyLoopCB); + bool Changed = + unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, UnswitchCB, SE, + &MSSAU, nullptr, nullptr, DestroyLoopCB); if (VerifyMemorySSA) MSSA->verifyMemorySSA(); diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -174,6 +174,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LICM ; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -137,6 +137,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LICM ; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -110,6 +110,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LICM ; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -119,6 +119,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LICM ; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -148,6 +148,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LICM ; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -114,6 +114,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LICM ; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass diff --git a/llvm/test/Transforms/LoopPredication/preserve-bpi.ll b/llvm/test/Transforms/LoopPredication/preserve-bpi.ll --- a/llvm/test/Transforms/LoopPredication/preserve-bpi.ll +++ b/llvm/test/Transforms/LoopPredication/preserve-bpi.ll @@ -10,6 +10,7 @@ ; CHECK: Running pass: LoopPredicationPass on Loop at depth 1 ; CHECK-NEXT: Running pass: LICMPass on Loop at depth 1 ; CHECK-NEXT: Running pass: SimpleLoopUnswitchPass on Loop at depth 1 +; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-NEXT: Running pass: LoopPredicationPass on Loop at depth 1 ; CHECK-NEXT: Running pass: LICMPass on Loop at depth 1 ; CHECK-NEXT: Running pass: SimpleLoopUnswitchPass on Loop at depth 1 diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + +; RUN: opt < %s -passes='require,function(loop-mssa(simple-loop-unswitch))' -S | FileCheck %s +; This test checks for a crash. +; RUN: opt < %s -passes=simple-loop-unswitch -aa-pipeline= -disable-output + +declare i32 @a() +declare i32 @b() + +define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, i1* %ptr) !prof !0 { +; CHECK-LABEL: @f1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[ENTRY_HOT_LOOP:%.*]] +; CHECK: entry_hot_loop: +; CHECK-NEXT: br i1 [[HOT_COND:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER:%.*]], label [[HOT_LOOP_EXIT:%.*]], !prof [[PROF15:![0-9]+]] +; CHECK: hot_loop_begin.preheader: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER_SPLIT_US:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER_SPLIT:%.*]] +; CHECK: hot_loop_begin.preheader.split.us: +; CHECK-NEXT: br label [[HOT_LOOP_BEGIN_US:%.*]] +; CHECK: hot_loop_begin.us: +; CHECK-NEXT: br label [[HOT_LOOP_A_US:%.*]] +; CHECK: hot_loop_a.us: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() +; CHECK-NEXT: br label [[HOT_LOOP_LATCH_US:%.*]] +; CHECK: hot_loop_latch.us: +; CHECK-NEXT: [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1 +; CHECK-NEXT: br i1 [[V1_US]], label [[HOT_LOOP_BEGIN_US]], label [[HOT_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] +; CHECK: hot_loop_exit.loopexit.split.us: +; CHECK-NEXT: br label [[HOT_LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK: hot_loop_begin.preheader.split: +; CHECK-NEXT: br label [[HOT_LOOP_BEGIN:%.*]] +; CHECK: hot_loop_begin: +; CHECK-NEXT: br label [[HOT_LOOP_B:%.*]] +; CHECK: hot_loop_b: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() +; CHECK-NEXT: br label [[HOT_LOOP_LATCH:%.*]] +; CHECK: hot_loop_latch: +; CHECK-NEXT: [[V1:%.*]] = load i1, i1* [[PTR]], align 1 +; CHECK-NEXT: br i1 [[V1]], label [[HOT_LOOP_BEGIN]], label [[HOT_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]] +; CHECK: hot_loop_exit.loopexit.split: +; CHECK-NEXT: br label [[HOT_LOOP_EXIT_LOOPEXIT]] +; CHECK: hot_loop_exit.loopexit: +; CHECK-NEXT: br label [[HOT_LOOP_EXIT]] +; CHECK: hot_loop_exit: +; CHECK-NEXT: br label [[ENTRY_COLD_LOOP:%.*]] +; CHECK: entry_cold_loop: +; CHECK-NEXT: br i1 [[COLD_COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER:%.*]], label [[COLD_LOOP_EXIT:%.*]], !prof [[PROF16:![0-9]+]] +; CHECK: cold_loop_begin.preheader: +; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] +; CHECK: cold_loop_begin: +; CHECK-NEXT: br i1 [[COND]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] +; CHECK: cold_loop_a: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a() +; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] +; CHECK: cold_loop_b: +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @b() +; CHECK-NEXT: br label [[COLD_LOOP_LATCH]] +; CHECK: cold_loop_latch: +; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR]], align 1 +; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK: cold_loop_exit.loopexit: +; CHECK-NEXT: br label [[COLD_LOOP_EXIT]] +; CHECK: cold_loop_exit: +; CHECK-NEXT: ret void +; +entry: + br label %entry_hot_loop + +entry_hot_loop: + br i1 %hot_cond, label %hot_loop_begin, label %hot_loop_exit, !prof !15 + +hot_loop_begin: + br i1 %cond, label %hot_loop_a, label %hot_loop_b + +hot_loop_a: + call i32 @a() + br label %hot_loop_latch + +hot_loop_b: + call i32 @b() + br label %hot_loop_latch + +hot_loop_latch: + %v1 = load i1, i1* %ptr + br i1 %v1, label %hot_loop_begin, label %hot_loop_exit + +hot_loop_exit: + br label %entry_cold_loop + +entry_cold_loop: + br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !16 + +cold_loop_begin: + br i1 %cond, label %cold_loop_a, label %cold_loop_b + +cold_loop_a: + call i32 @a() + br label %cold_loop_latch + +cold_loop_b: + call i32 @b() + br label %cold_loop_latch + +cold_loop_latch: + %v2 = load i1, i1* %ptr + br i1 %v2, label %cold_loop_begin, label %cold_loop_exit + +cold_loop_exit: + ret void +} + +!llvm.module.flags = !{!1} +!0 = !{!"function_entry_count", i64 400} +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 10} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"branch_weights", i32 100, i32 0} +!16 = !{!"branch_weights", i32 0, i32 100} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll @@ -18,6 +18,7 @@ ; the analysis caches. ; ; CHECK: Running pass: SimpleLoopUnswitchPass on Loop at depth 1 containing: %loop_begin
,%loop_b,%loop_b_inner,%loop_b_inner_exit,%loop_a,%loop_a_inner,%loop_a_inner_exit,%latch +; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-NEXT: Clearing all analysis results for: loop_a_inner