Index: llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h @@ -37,6 +37,7 @@ class AliasSet; class AliasSetTracker; class BasicBlock; +class BlockFrequencyInfo; class DataLayout; class Loop; class LoopInfo; @@ -114,26 +115,26 @@ /// reverse depth first order w.r.t the DominatorTree. This allows us to visit /// uses before definitions, allowing us to sink a loop body in one pass without /// iteration. Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, -/// DataLayout, TargetLibraryInfo, Loop, AliasSet information for all +/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all /// instructions of the loop and loop safety information as /// arguments. Diagnostics is emitted via \p ORE. It returns changed status. bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *, - TargetLibraryInfo *, TargetTransformInfo *, Loop *, - AliasSetTracker *, MemorySSAUpdater *, ICFLoopSafetyInfo *, + BlockFrequencyInfo *, TargetLibraryInfo *, TargetTransformInfo *, + Loop *, AliasSetTracker *, MemorySSAUpdater *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *); /// Walk the specified region of the CFG (defined by all blocks /// dominated by the specified block, and that are in the current loop) in depth /// first order w.r.t the DominatorTree. This allows us to visit definitions /// before uses, allowing us to hoist a loop body in one pass without iteration. -/// Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, DataLayout, +/// Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, BlockFrequencyInfo, /// TargetLibraryInfo, Loop, AliasSet information for all instructions of the /// loop and loop safety information as arguments. Diagnostics is emitted via \p /// ORE. It returns changed status. bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *, - TargetLibraryInfo *, Loop *, AliasSetTracker *, - MemorySSAUpdater *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *); + BlockFrequencyInfo *, TargetLibraryInfo *, Loop *, AliasSetTracker *, + MemorySSAUpdater *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, + OptimizationRemarkEmitter *); /// This function deletes dead loops. The caller of this function needs to /// guarantee that the loop is infact dead. Index: llvm/trunk/lib/Transforms/Scalar/LICM.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LICM.cpp +++ llvm/trunk/lib/Transforms/Scalar/LICM.cpp @@ -95,6 +95,11 @@ "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); +static cl::opt HoistSinkColdnessThreshold( + "licm-coldness-threshold", cl::Hidden, cl::init(4), + cl::desc("Relative coldness Threshold of hoisting/sinking destination " + "block for LICM to be considered beneficial")); + static cl::opt MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -139,8 +144,9 @@ BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE); static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -168,8 +174,8 @@ struct LoopInvariantCodeMotion { using ASTrackerMapTy = DenseMap>; bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA, + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST); ASTrackerMapTy &getLoopToAliasSetMap() { return LoopToAliasSetMap; } @@ -220,6 +226,7 @@ &getAnalysis().getAAResults(), &getAnalysis().getLoopInfo(), &getAnalysis().getDomTree(), + &getAnalysis().getBFI(), &getAnalysis().getTLI(), &getAnalysis().getTTI( *L->getHeader()->getParent()), @@ -230,6 +237,7 @@ /// loop preheaders be inserted into the CFG... /// void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addRequired(); @@ -286,7 +294,8 @@ "cached at a higher level"); LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); - if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE, + auto BFI = FAM.getCachedResult(*F); + if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, ORE, true)) return PreservedAnalyses::all(); @@ -324,8 +333,9 @@ /// bool LoopInvariantCodeMotion::runOnLoop( Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, - MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) { + BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, + bool DeleteAST) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -385,11 +395,11 @@ LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true}; if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any @@ -491,9 +501,10 @@ /// definitions, allowing us to sink a loop body in one pass without iteration. /// bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, Loop *CurLoop, - AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + Loop *CurLoop, AliasSetTracker *CurAST, + MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { @@ -542,7 +553,7 @@ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE) && !I.mayHaveSideEffects()) { - if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) { + if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FreeInLoop) { ++II; eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); @@ -786,13 +797,43 @@ }; } // namespace +// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only +// only worthwhile if the destination block is actually colder than current +// block. +static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, + OptimizationRemarkEmitter *ORE, + BlockFrequencyInfo *BFI) { + // Check block frequency only when runtime profile is available. + // to avoid pathological cases. With static profile, lean towards + // hosting because it helps canonicalize the loop for vectorizer. + if (!DstBlock->getParent()->hasProfileData()) + return true; + + if (!HoistSinkColdnessThreshold || !BFI) + return true; + + BasicBlock *SrcBlock = I.getParent(); + if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > + BFI->getBlockFreq(SrcBlock).getFrequency()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) + << "failed to sink or hoist instruction because containing block " + "has lower frequency than destination block"; + }); + return false; + } + + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. /// bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, @@ -843,13 +884,15 @@ // Try hoisting the instruction out to the preheader. We can only do // this if all of the operands of the instruction are loop invariant and - // if it is safe to hoist the instruction. + // if it is safe to hoist the instruction. We also check block frequency + // to make sure instruction only gets hoisted into colder blocks. // TODO: It may be safe to hoist if we are hoisting to a conditional block // and we have accurately duplicated the control flow from the loop header // to that block. if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags, ORE) && + worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -1550,8 +1593,9 @@ /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + BlockFrequencyInfo *BFI, const Loop *CurLoop, + ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I) @@ -1627,7 +1671,10 @@ // If this instruction is only used outside of the loop, then all users are // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. + // First check if I is worth sinking for all uses. Sink only when it is worth + // across all uses. SmallSetVector Users(I.user_begin(), I.user_end()); + SmallVector ExitPNs; for (auto *UI : Users) { auto *User = cast(UI); @@ -1637,6 +1684,15 @@ PHINode *PN = cast(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); + + if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { + return Changed; + } + + ExitPNs.push_back(PN); + } + + for (auto *PN: ExitPNs) { // The PHI must be trivially replaceable. Instruction *New = sinkThroughTriviallyReplaceablePHI( PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU); Index: llvm/trunk/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/trunk/test/Other/opt-O2-pipeline.ll +++ llvm/trunk/test/Other/opt-O2-pipeline.ll @@ -102,7 +102,11 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction @@ -154,12 +158,15 @@ ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -246,10 +253,13 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/trunk/test/Other/opt-O3-pipeline.ll =================================================================== --- llvm/trunk/test/Other/opt-O3-pipeline.ll +++ llvm/trunk/test/Other/opt-O3-pipeline.ll @@ -107,7 +107,11 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction @@ -159,12 +163,15 @@ ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -251,10 +258,13 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/trunk/test/Other/opt-Os-pipeline.ll =================================================================== --- llvm/trunk/test/Other/opt-Os-pipeline.ll +++ llvm/trunk/test/Other/opt-Os-pipeline.ll @@ -89,7 +89,11 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Rotate Loops +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Unswitch loops ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction @@ -141,12 +145,15 @@ ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Post-Dominator Tree Construction @@ -233,10 +240,13 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions +; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Block Frequency Analysis ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Loop Invariant Code Motion ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/trunk/test/Other/pass-pipelines.ll =================================================================== --- llvm/trunk/test/Other/pass-pipelines.ll +++ llvm/trunk/test/Other/pass-pipelines.ll @@ -52,7 +52,7 @@ ; CHECK-O2-NEXT: FunctionPass Manager ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Pass Manager -; CHECK-O2-NOT: Manager +; Requiring block frequency for LICM will place ICM and rotation under separate Loop Pass Manager ; FIXME: We shouldn't be pulling out to simplify-cfg and instcombine and ; causing new loop pass managers. ; CHECK-O2: Simplify the CFG Index: llvm/trunk/test/Transforms/LICM/sink.ll =================================================================== --- llvm/trunk/test/Transforms/LICM/sink.ll +++ llvm/trunk/test/Transforms/LICM/sink.ll @@ -1,8 +1,10 @@ -; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm -licm-coldness-threshold=0 < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK ; RUN: opt -S < %s -passes='require,loop(licm),loop-sink' \ ; RUN: | FileCheck %s --check-prefix=CHECK-SINK -; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -licm-coldness-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; Original source code: ; int g; @@ -29,6 +31,10 @@ ; CHECK-LICM: load i32, i32* @g ; CHECK-LICM: br label %.lr.ph +; CHECK-BFI-LICM: .lr.ph.preheader: +; CHECK-BFI-LICM-NOT: load i32, i32* @g +; CHECK-BFI-LICM: br label %.lr.ph + .lr.ph: %.03 = phi i32 [ %8, %.combine ], [ 0, %.lr.ph.preheader ] %.012 = phi i32 [ %.1, %.combine ], [ %1, %.lr.ph.preheader ]