Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -25,6 +25,7 @@ class AliasSetTracker; class AssumptionCache; class BasicBlock; +class BlockFrequencyInfo; class DataLayout; class DominatorTree; class Loop; @@ -357,9 +358,9 @@ /// Takes DomTreeNode, AliasAnalysis, LoopInfo, DominatorTree, DataLayout, /// TargetLibraryInfo, Loop, AliasSet information for all instructions of the /// loop and loop safety information as arguments. It returns changed status. -bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *, - TargetLibraryInfo *, Loop *, AliasSetTracker *, - LICMSafetyInfo *); +bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, + BlockFrequencyInfo *, DominatorTree *, TargetLibraryInfo *, + Loop *, AliasSetTracker *, LICMSafetyInfo *); /// \brief Try to promote memory values to scalars by sinking stores out of /// the loop and moving loads to before the loop. We do this by looping over Index: lib/Transforms/Scalar/LICM.cpp =================================================================== --- lib/Transforms/Scalar/LICM.cpp +++ lib/Transforms/Scalar/LICM.cpp @@ -34,6 +34,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -121,6 +122,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<BlockFrequencyInfoWrapperPass>(); getLoopAnalysisUsage(AU); } @@ -135,6 +137,7 @@ AliasAnalysis *AA; // Current AliasAnalysis information LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. 
+ BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. @@ -164,6 +167,7 @@ INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -182,6 +186,7 @@ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -212,7 +217,7 @@ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop, CurAST, &SafetyInfo); if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, BFI, DT, TLI, CurLoop, CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any @@ -328,7 +333,8 @@ /// uses, allowing us to hoist a loop body in one pass without iteration. /// bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + BlockFrequencyInfo *BFI, DominatorTree *DT, + TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && @@ -337,6 +343,10 @@ BasicBlock *BB = N->getBlock(); + bool ShouldHoist = true; + if (BFI->getBlockFreq(BB) < BFI->getBlockFreq(CurLoop->getLoopPreheader())) + ShouldHoist = false; + // If this subregion is not in the top level loop at all, exit. if (!CurLoop->contains(BB)) return false; @@ -359,6 +369,8 @@ I.eraseFromParent(); continue; } + if (!ShouldHoist) + continue; // Try hoisting the instruction out to the preheader. 
We can only do this // if all of the operands of the instruction are loop invariant and if it @@ -374,7 +386,8 @@ const std::vector<DomTreeNode *> &Children = N->getChildren(); for (DomTreeNode *Child : Children) - Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + Changed |= + hoistRegion(Child, AA, LI, BFI, DT, TLI, CurLoop, CurAST, SafetyInfo); return Changed; } Index: test/Other/pass-pipelines.ll =================================================================== --- test/Other/pass-pipelines.ll +++ test/Other/pass-pipelines.ll @@ -37,6 +37,12 @@ ; CHECK-O2-NEXT: FunctionPass Manager ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Pass Manager +; CHECK-O2: Branch Probability Analysis +; CHECK-O2: Block Frequency Analysis +; CHECK-O2: Loop Pass Manager +; CHECK-O2: Loop Invariant Code Motion +; CHECK-O2: Loop Pass Manager +; CHECK-O2: Unswitch loops ; CHECK-O2-NOT: Manager ; FIXME: We shouldn't be pulling out to simplify-cfg and instcombine and ; causing new loop pass managers. Index: test/Transforms/LICM/not-hoist-low-freq.ll =================================================================== --- /dev/null +++ test/Transforms/LICM/not-hoist-low-freq.ll @@ -0,0 +1,81 @@ +; RUN: opt -S -licm < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Original source code: +; int g, x; +; void foo(int p) { +; for (int i = 0; i != x; i++) +; if (__builtin_expect(i == p, 0)) { +; x += g; x *= g; +; } +; } +; +; Load of global value g should not be hoisted to preheader. 
+ +@g = global i32 0, align 4 +@x = global i32 0, align 4 + +; Function Attrs: nounwind uwtable +define void @_Z3fooi(i32 %p) #0 { +entry: + %p.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %p, i32* %p.addr, align 4, !tbaa !1 + %0 = bitcast i32* %i to i8* + store i32 0, i32* %i, align 4, !tbaa !1 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4, !tbaa !1 + %2 = load i32, i32* @x, align 4, !tbaa !1 + %cmp = icmp ne i32 %1, %2 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %3 = bitcast i32* %i to i8* + ret void + +for.body: ; preds = %for.cond + %4 = load i32, i32* %i, align 4, !tbaa !1 + %5 = load i32, i32* %p.addr, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %4, %5 + %conv = zext i1 %cmp1 to i64 + %tobool = icmp ne i64 %conv, 0 + br i1 %tobool, label %if.then, label %for.inc, !prof !5 + +if.then: ; preds = %for.body + %6 = load i32, i32* @g, align 4, !tbaa !1 + %7 = load i32, i32* @x, align 4, !tbaa !1 + %add = add nsw i32 %7, %6 + store i32 %add, i32* @x, align 4, !tbaa !1 + %8 = load i32, i32* @g, align 4, !tbaa !1 + %9 = load i32, i32* @x, align 4, !tbaa !1 + %mul = mul nsw i32 %9, %8 + store i32 %mul, i32* @x, align 4, !tbaa !1 + br label %for.inc + +; CHECK: if.then: +; CHECK: load i32, i32* @g +; CHECK: br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %10 = load i32, i32* %i, align 4, !tbaa !1 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %i, align 4, !tbaa !1 + br label %for.cond +} + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind readnone } +attributes #3 
= { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.9.0 (trunk 268579) (llvm/trunk 268587)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C++ TBAA"} +!5 = !{!"branch_weights", i32 1, i32 2000}