diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -26,6 +26,7 @@ class AliasSet; class AliasSetTracker; class BasicBlock; +class BlockFrequencyInfo; class IRBuilderBase; class Loop; class LoopInfo; @@ -123,12 +124,13 @@ /// reverse depth first order w.r.t the DominatorTree. This allows us to visit /// uses before definitions, allowing us to sink a loop body in one pass without /// iteration. Takes DomTreeNode, AAResults, LoopInfo, DominatorTree, -/// TargetLibraryInfo, Loop, AliasSet information for all +/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all /// instructions of the loop and loop safety information as /// arguments. Diagnostics is emitted via \p ORE. It returns changed status. bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, - TargetLibraryInfo *, TargetTransformInfo *, Loop *, - AliasSetTracker *, MemorySSAUpdater *, ICFLoopSafetyInfo *, + BlockFrequencyInfo *, TargetLibraryInfo *, + TargetTransformInfo *, Loop *, AliasSetTracker *, + MemorySSAUpdater *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *); /// Walk the specified region of the CFG (defined by all blocks @@ -136,13 +138,14 @@ /// first order w.r.t the DominatorTree. This allows us to visit definitions /// before uses, allowing us to hoist a loop body in one pass without iteration. /// Takes DomTreeNode, AAResults, LoopInfo, DominatorTree, -/// TargetLibraryInfo, Loop, AliasSet information for all instructions of the -/// loop and loop safety information as arguments. Diagnostics is emitted via \p -/// ORE. It returns changed status. +/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all +/// instructions of the loop and loop safety information as arguments. +/// Diagnostics is emitted via \p ORE. It returns changed status. bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, - TargetLibraryInfo *, Loop *, AliasSetTracker *, - MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *); + BlockFrequencyInfo *, TargetLibraryInfo *, Loop *, + AliasSetTracker *, MemorySSAUpdater *, ScalarEvolution *, + ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, + OptimizationRemarkEmitter *); /// This function deletes dead loops. The caller of this function needs to /// guarantee that the loop is infact dead. diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -2429,9 +2429,11 @@ return Err; // Add the nested pass manager with the appropriate adaptor. bool UseMemorySSA = (Name == "loop-mssa"); - FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM), UseMemorySSA, /*UseBlockFrequencyInfo=*/false, - DebugLogging)); + bool UseBFI = + std::any_of(InnerPipeline.begin(), InnerPipeline.end(), + [](auto Pipeline) { return Pipeline.Name == "licm"; }); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA, + UseBFI, DebugLogging)); return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -35,6 +35,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -99,6 +100,11 @@ "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); +static cl::opt HoistSinkColdnessThreshold( + "licm-coldness-threshold", cl::Hidden, cl::init(4), + cl::desc("Relative coldness Threshold of hoisting/sinking destination " + "block for LICM to be considered beneficial")); + static cl::opt MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -144,8 +150,9 @@ MemorySSAUpdater *MSSAU, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE); static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -356,12 +363,13 @@ LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true}; if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); + Changed |= + sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L, + CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) Changed |= - hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any @@ -458,10 +466,10 @@ /// definitions, allowing us to sink a loop body in one pass without iteration. /// bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, Loop *CurLoop, - AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + Loop *CurLoop, AliasSetTracker *CurAST, + MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { @@ -510,7 +518,7 @@ isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE)) { - if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) { + if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; salvageDebugInfo(I); @@ -755,13 +763,43 @@ }; } // namespace +// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only +// only worthwhile if the destination block is actually colder than current +// block. +static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, + OptimizationRemarkEmitter *ORE, + BlockFrequencyInfo *BFI) { + // Check block frequency only when runtime profile is available + // to avoid pathological cases. With static profile, lean towards + // hosting because it helps canonicalize the loop for vectorizer. + if (!DstBlock->getParent()->hasProfileData()) + return true; + + if (!HoistSinkColdnessThreshold || !BFI) + return true; + + BasicBlock *SrcBlock = I.getParent(); + if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > + BFI->getBlockFreq(SrcBlock).getFrequency()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) + << "failed to sink or hoist instruction because containing block " + "has lower frequency than destination block"; + }); + return false; + } + + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. /// bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, @@ -812,13 +850,15 @@ // Try hoisting the instruction out to the preheader. We can only do // this if all of the operands of the instruction are loop invariant and - // if it is safe to hoist the instruction. + // if it is safe to hoist the instruction. We also check block frequency + // to make sure instruction only gets hoisted into colder blocks. // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE) && + worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -1554,8 +1594,9 @@ /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I) @@ -1631,7 +1672,10 @@ // If this instruction is only used outside of the loop, then all users are // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. + // First check if I is worth sinking for all uses. Sink only when it is worth + // across all uses. SmallSetVector Users(I.user_begin(), I.user_end()); + SmallVector ExitPNs; for (auto *UI : Users) { auto *User = cast(UI); @@ -1641,6 +1685,15 @@ PHINode *PN = cast(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); + if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { + return Changed; + } + + ExitPNs.push_back(PN); + } + + for (auto *PN : ExitPNs) { + // The PHI must be trivially replaceable. Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); diff --git a/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof b/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof @@ -0,0 +1,7 @@ +_Z3fooii:200:1 + 0: 1 + 1: 1 _Z3bari:1 + 2: 200 + 3: 200 + 4: 0 + 5: 1 diff --git a/llvm/test/Transforms/LICM/no-hoist-prof.ll b/llvm/test/Transforms/LICM/no-hoist-prof.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LICM/no-hoist-prof.ll @@ -0,0 +1,88 @@ +; RUN: opt -enable-new-pm=1 -sample-profile -licm -S -sample-profile-file='%S/Inputs/no-hoist-prof.prof' < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM +; RUN: opt -passes=licm -S < %s | FileCheck %s --check-prefix=CHECK-LICM + +; Original source code: +; +; int bar(int); +; int foo(int iter, int explode) { +; int base = bar(explode); +; for (int i = 0; i != iter; ++i) +; if (i == explode) +; iter = (base * base) + bar(iter); +; return iter; +; } + +; We need debug information in this .ll in order to leverage the pgo file, so: +; .ll generated by running `clang++ -O3 -g -S -emit-llvm`, then: +; - move hoisted mul back into cold section +; - give labels names +; - reindex variables +; - remove metadata calls, attributes, module header +; - remove unnecessary metadata + +; CHECK-LICM: .l.check.preheader:{{.*}} +; CHECK-LICM-NEXT: {{.*}} = mul {{.*}} +; CHECK-LICM-NEXT: br{{.*}} + +; CHECK-BFI-LICM: .l.cold:{{.*}} +; CHECK-BFI-LICM-NEXT: {{.*}} = mul {{.*}} + +define dso_local i32 @_Z3fooii(i32, i32) local_unnamed_addr #0 !dbg !7 { + %3 = tail call i32 @_Z3bari(i32 %1), !dbg !19 + %4 = icmp eq i32 %0, 0, !dbg !22 + br i1 %4, label %.l.ret, label %.l.check.preheader, !dbg !24 + +.l.check.preheader: + br label %.l.check, !dbg !24 + +.l.ret: + %5 = phi i32 [ 0, %2 ], [ %12, %.l.iterate ] + ret i32 %5, !dbg !25 + +.l.check: + %6 = phi i32 [ 0, %.l.check.preheader ], [ %13, %.l.iterate ] + %7 = phi i32 [ %0, %.l.check.preheader ], [ %12, %.l.iterate ] + %8 = icmp eq i32 %6, %1, !dbg !26 + br i1 %8, label %.l.cold, label %.l.iterate, !dbg !28 + +.l.cold: + %9 = mul nsw i32 %3, %3 + %10 = tail call i32 @_Z3bari(i32 %7), !dbg !29 + %11 = add nsw i32 %10, %9, !dbg !30 + br label %.l.iterate, !dbg !31 + +.l.iterate: + %12 = phi i32 [ %11, %.l.cold ], [ %7, %.l.check ] + %13 = add nuw nsw i32 %6, 1, !dbg !32 + %14 = icmp eq i32 %13, %12, !dbg !22 + br i1 %14, label %.l.ret, label %.l.check, !dbg !24, !llvm.loop !33 +} + +attributes #0 = { "use-sample-profile" } + +declare dso_local i32 @_Z3bari(i32) local_unnamed_addr #1 + +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.20181009 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, nameTableKind: None) +!1 = !DIFile(filename: "foo.cpp", directory: "/tmp/gather_pgo") +!4 = !{i32 2, !"Debug Info Version", i32 3} +!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !10, !10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!16 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 3) +!19 = !DILocation(line: 3, column: 14, scope: !7) +!22 = !DILocation(line: 4, column: 21, scope: !23) +!23 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 3) +!24 = !DILocation(line: 4, column: 3, scope: !16) +!25 = !DILocation(line: 7, column: 3, scope: !7) +!26 = !DILocation(line: 5, column: 11, scope: !27) +!27 = distinct !DILexicalBlock(scope: !23, file: !1, line: 5, column: 9) +!28 = !DILocation(line: 5, column: 9, scope: !23) +!29 = !DILocation(line: 6, column: 30, scope: !27) +!30 = !DILocation(line: 6, column: 28, scope: !27) +!31 = !DILocation(line: 6, column: 7, scope: !27) +!32 = !DILocation(line: 4, column: 30, scope: !23) +!33 = distinct !{!33, !24, !34} +!34 = !DILocation(line: 6, column: 38, scope: !16) diff --git a/llvm/test/Transforms/LICM/sink.ll b/llvm/test/Transforms/LICM/sink.ll --- a/llvm/test/Transforms/LICM/sink.ll +++ b/llvm/test/Transforms/LICM/sink.ll @@ -1,8 +1,10 @@ -; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm-coldness-threshold=0 < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK ; RUN: opt -S < %s -passes='require,loop(licm),loop-sink' \ ; RUN: | FileCheck %s --check-prefix=CHECK-SINK -; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm-coldness-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; Original source code: ; int g; @@ -29,6 +31,10 @@ ; CHECK-LICM: load i32, i32* @g ; CHECK-LICM: br label %.lr.ph +; CHECK-BFI-LICM: .lr.ph.preheader: +; CHECK-BFI-LICM-NOT: load i32, i32* @g +; CHECK-BFI-LICM: br label %.lr.ph + .lr.ph: %.03 = phi i32 [ %8, %.combine ], [ 0, %.lr.ph.preheader ] %.012 = phi i32 [ %.1, %.combine ], [ %1, %.lr.ph.preheader ]