Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -81,6 +81,12 @@ "instruction count below this threshold"), cl::init(4), cl::Hidden); +static cl::opt LoopToColdBlockRatio( + "loop-to-cold-block-ratio", + cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " + "(frequency of block) is greater than this ratio"), + cl::init(5), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -804,8 +810,34 @@ for (MachineLoop *InnerLoop : L) buildLoopChains(F, *InnerLoop); + MachineBasicBlock *LoopHeader = L.getHeader(); SmallVector BlockWorkList; - BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + BlockFilterSet LoopBlockSet; + + // Filter cold blocks off from LoopBlockSet when profile data is available. + // Collect the sum of frequencies of incoming edges to the loop header from + // outside. If we treat the loop as a super block, this is the frequency of + // the loop. Then for each block in the loop, we calculate the ratio between + // its frequency and the frequency of the loop block. When it is too small, + // don't add it to the loop chain. If there are outer loops, then this block + // will be merged into the first outer loop chain for which this block is not + // cold anymore. This needs precise profile data and we only do this when + // profile data is available. + if (F.getFunction()->getEntryCount()) { + BlockFrequency LoopFreq(0); + for (auto LoopPred : LoopHeader->predecessors()) + if (!L.contains(LoopPred)) + LoopFreq += MBFI->getBlockFreq(LoopPred) * + MBPI->getEdgeProbability(LoopPred, LoopHeader); + + for (MachineBasicBlock *LoopBB : L.getBlocks()) { + auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); + if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) + continue; + LoopBlockSet.insert(LoopBB); + } + } else + LoopBlockSet.insert(L.block_begin(), L.block_end()); // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the @@ -817,7 +849,7 @@ // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) + if (LoopTop == LoopHeader) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -828,7 +860,8 @@ SmallPtrSet UpdatedPreds; assert(LoopChain.LoopPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : L.getBlocks()) { + + for (MachineBasicBlock *LoopBB : LoopBlockSet) { BlockChain &Chain = *BlockToChain[LoopBB]; if (!UpdatedPreds.insert(&Chain).second) continue; Index: test/CodeGen/X86/code_placement_cold_loop_blocks.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/code_placement_cold_loop_blocks.ll @@ -0,0 +1,122 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK + +define void @foo() !prof !1 { +; Test if a cold block in a loop will be placed at the end of the function +; chain. +; +; CHECK-LABEL: foo: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq e +; CHECK: callq f +; CHECK: callq d + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !4 + +if.then: + call void @c() + br label %if.end + +if.else: + call void @d() + br label %if.end + +if.end: + call void @e() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end, !prof !5 + +end: + call void @f() + ret void +} + +define void @nested_loop_0() !prof !1 { +; Test if a block that is cold in the inner loop but not cold in the outer loop +; will merged to the outer loop chain. +; +; CHECK-LABEL: nested_loop_0: +; CHECK: callq c +; CHECK: callq d +; CHECK: callq e +; CHECK: callq b +; CHECK: callq f + +entry: + br label %header + +header: + call void @b() + %call4 = call zeroext i1 @a() + br i1 %call4, label %header2, label %end + +header2: + call void @c() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @d() + %call3 = call zeroext i1 @a() + br i1 %call3, label %header2, label %header, !prof !3 + +if.else: + call void @e() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %header, !prof !3 + +end: + call void @f() + ret void +} + +define void @nested_loop_1() !prof !1 { +; Test if a cold block in an inner loop will be placed at the end of the +; function chain. +; +; CHECK-LABEL: nested_loop_1: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq e +; CHECK: callq d + +entry: + br label %header + +header: + call void @b() + br label %header2 + +header2: + call void @c() + %call = call zeroext i1 @a() + br i1 %call, label %end, label %if.else, !prof !4 + +if.else: + call void @d() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %header, !prof !5 + +end: + call void @e() + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 100, i32 1} +!3 = !{!"branch_weights", i32 1, i32 10} +!4 = !{!"branch_weights", i32 1000, i32 1} +!5 = !{!"branch_weights", i32 100, i32 1}