Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -804,8 +804,34 @@ for (MachineLoop *InnerLoop : L) buildLoopChains(F, *InnerLoop); + MachineBasicBlock *LoopHeader = L.getHeader(); SmallVector<MachineBasicBlock *, 16> BlockWorkList; - BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + BlockFilterSet LoopBlockSet; + + // Filter cold blocks off from LoopBlockSet when profile data is available. + // Collect the sum of frequencies of incoming edges to the loop header from + // outside. If we treat the loop as a super block, this is the frequency of + // the loop. Then for each block in the loop, we calculate the ratio between + // its frequency and the frequency of the loop block. When it is too small, + // don't add it to the loop chain. This needs precise profile data and we + // only do this when profile data is available. + if (F.getFunction()->getEntryCount()) { + BlockFrequency LoopFreq(0); + for (auto LoopPred : LoopHeader->predecessors()) + if (!L.contains(LoopPred)) + LoopFreq += MBFI->getBlockFreq(LoopPred) * + MBPI->getEdgeProbability(LoopPred, LoopHeader); + + // We use the threshold 20% here to filter cold blocks. + const unsigned LoopToColdBBRatio = 5; + for (MachineBasicBlock *LoopBB : L.getBlocks()) { + auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); + if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBBRatio) + continue; + LoopBlockSet.insert(LoopBB); + } + } else + LoopBlockSet.insert(L.block_begin(), L.block_end()); // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the @@ -817,7 +843,7 @@ // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) + if (LoopTop == LoopHeader) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -828,7 +854,8 @@ SmallPtrSet<BlockChain *, 4> UpdatedPreds; assert(LoopChain.LoopPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : L.getBlocks()) { + + for (MachineBasicBlock *LoopBB : LoopBlockSet) { BlockChain &Chain = *BlockToChain[LoopBB]; if (!UpdatedPreds.insert(&Chain).second) continue; Index: test/CodeGen/X86/code_placement_cold_loop_blocks.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/code_placement_cold_loop_blocks.ll @@ -0,0 +1,81 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK + +define void @foo() !prof !1 { +; Test if a cold block in a loop will be placed at the end of the function +; chain. +; +; CHECK-LABEL: foo: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq e +; CHECK: callq f +; CHECK: callq d + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @c() + br label %if.end + +if.else: + call void @d() + br label %if.end + +if.end: + call void @e() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end, !prof !3 + +end: + call void @f() + ret void +} + +define void @bar() !prof !1 { +; Test if a cold block in an inner loop will be placed at the end of the +; function chain. +; +; CHECK-LABEL: bar: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq e +; CHECK: callq d + +entry: + br label %header + +header: + call void @b() + br label %header2 + +header2: + call void @c() + %call = call zeroext i1 @a() + br i1 %call, label %end, label %if.else, !prof !2 + +if.else: + call void @d() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %header, !prof !3 + +end: + call void @e() + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 1000, i32 1} +!3 = !{!"branch_weights", i32 100, i32 1}