MachineBlockPlacement: change how cold blocks are filtered out of loop chains.

Summary of what this patch does:
  * Raises the default of -loop-to-cold-block-ratio from 5 to 160.
  * Uses the block frequency of the loop header as the loop frequency,
    instead of summing the frequencies of the edges that enter the header
    from outside the loop. A block is left out of the loop chain when
    (header frequency / block frequency) exceeds the ratio, or when the
    block frequency is zero.
  * Adds DEBUG output for the loop frequency and for each excluded block.
  * Extends code_placement_cold_loop_blocks.ll with a second RUN line
    (NOPRECISE / PRECISE check prefixes) and two new test functions.

NOTE(review): the new RUN line passes -force-evict-cold-blocks-from-loops
and -force-loop-cold-block to llc, but neither option is defined in this
patch, and the filtering code is still guarded only by getEntryCount().
Presumably they come from a prerequisite change -- confirm before landing.

Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -89,7 +89,7 @@
     "loop-to-cold-block-ratio",
     cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
              "(frequency of block) is greater than this ratio"),
-    cl::init(5), cl::Hidden);
+    cl::init(160), cl::Hidden);
 
 static cl::opt<bool>
     PreciseRotationCost("precise-rotation-cost",
@@ -2138,25 +2138,24 @@
   BlockFilterSet LoopBlockSet;
 
   // Filter cold blocks off from LoopBlockSet when profile data is available.
-  // Collect the sum of frequencies of incoming edges to the loop header from
-  // outside. If we treat the loop as a super block, this is the frequency of
-  // the loop. Then for each block in the loop, we calculate the ratio between
-  // its frequency and the frequency of the loop block. When it is too small,
+  // For each block in the loop, we calculate the estimated number of times
+  // it will execute for each iteration of the loop. When it is too small,
   // don't add it to the loop chain. If there are outer loops, then this block
   // will be merged into the first outer loop chain for which this block is not
-  // cold anymore. This needs precise profile data and we only do this when
-  // profile data is available.
+  // cold anymore. By default, we only do this when profile data is available.
   if (F->getFunction()->getEntryCount()) {
-    BlockFrequency LoopFreq(0);
-    for (auto LoopPred : L.getHeader()->predecessors())
-      if (!L.contains(LoopPred))
-        LoopFreq += MBFI->getBlockFreq(LoopPred) *
-                    MBPI->getEdgeProbability(LoopPred, L.getHeader());
-
+    BlockFrequency LoopFreq = MBFI->getBlockFreq(L.getHeader());
+    DEBUG(dbgs() << "Finding loop blocks for loop with frequency "
+                 << LoopFreq.getFrequency() << "\n");
     for (MachineBasicBlock *LoopBB : L.getBlocks()) {
       auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
-      if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
+      if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) {
+        DEBUG(dbgs() << "Excluding " << getBlockName(LoopBB)
+                     << " with frequency " << Freq
+                     << " from loop with frequency " << LoopFreq.getFrequency()
+                     << "\n");
         continue;
+      }
       LoopBlockSet.insert(LoopBB);
     }
   } else
Index: test/CodeGen/X86/code_placement_cold_loop_blocks.ll
===================================================================
--- test/CodeGen/X86/code_placement_cold_loop_blocks.ll
+++ test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NOPRECISE
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -force-evict-cold-blocks-from-loops -force-loop-cold-block < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PRECISE
 
 define void @foo() !prof !1 {
 ; Test if a cold block in a loop will be placed at the end of the function
@@ -42,10 +43,11 @@
 ; will merged to the outer loop chain.
 ;
 ; CHECK-LABEL: nested_loop_0:
+; PRECISE: callq b
 ; CHECK: callq c
 ; CHECK: callq d
 ; CHECK: callq e
-; CHECK: callq b
+; NOPRECISE: callq b
 ; CHECK: callq f
 
 entry:
@@ -107,6 +109,99 @@
   ret void
 }
 
+define void @cold_block_in_hot_loop() !prof !1 {
+; Test that a cold block gets moved out of a high-trip-count loop.
+;
+; CHECK-LABEL: cold_block_in_hot_loop:
+; NOPRECISE: callq b
+; NOPRECISE: callq d
+; NOPRECISE: callq e
+; NOPRECISE: callq c
+; With precise rotation cost, the %header -> %cold edge (10 occurrences)
+; is preferred over the %entry -> %header edge (1 occurrence).
+; PRECISE: callq d
+; PRECISE: callq b
+; PRECISE: callq c
+; PRECISE: callq e
+entry:
+  br label %header
+
+header:
+  call void @b()
+  %call = call i1 @a()
+  br i1 %call, label %cold, label %after.cold, !prof !7
+
+cold:
+  call void @c()
+  br label %after.cold
+
+after.cold:
+  call void @d()
+  %cont = call i1 @a()
+  br i1 %cont, label %header, label %done, !prof !8
+
+done:
+  call void @e()
+  ret void
+}
+
+define void @no_profile_data() {
+; Test that we sensibly place a cold block within a loop even when no
+; overall profile data is available, when precise loop rotation cost
+; and cold loop block eviction are enabled.
+;
+; CHECK-LABEL: no_profile_data:
+; NOPRECISE: callq a
+; NOPRECISE: callq d
+; NOPRECISE: callq g
+; NOPRECISE: callq b
+; NOPRECISE: callq c
+; PRECISE: callq a
+; PRECISE: callq c
+; PRECISE: callq g
+; PRECISE: callq b
+; PRECISE: callq d
+entry:
+  br label %for.outer.body
+
+for.outer.body:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.outer.inc ]
+  %br = call i1 @a()
+  br label %for.inner.body
+
+for.inner.body:
+  %i.inner = phi i32 [ 0, %for.outer.body ], [ %i.inc, %for.inner.inc ]
+  br i1 %br, label %if.then, label %if.else
+
+if.then:
+  %br.unlikely = call i1 @g()
+  br i1 %br.unlikely, label %unlikely, label %if.end, !prof !6
+
+unlikely:
+  call void @d()
+  br label %if.end
+
+if.end:
+  call void @b()
+  br label %for.inner.inc
+
+if.else:
+  call void @c()
+  br label %for.inner.inc
+
+for.inner.inc:
+  %i.inc = add i32 %i.inner, 1
+  %done.inner = icmp eq i32 %i.inc, 1000
+  br i1 %done.inner, label %for.outer.inc, label %for.inner.body
+
+for.outer.inc:
+  %done.outer = icmp eq i32 %i.inc, 1000000
+  br i1 %done.outer, label %ret, label %for.outer.body
+
+ret:
+  ret void
+}
+
 declare zeroext i1 @a()
 declare void @b()
 declare void @c()
@@ -113,6 +208,7 @@
 declare void @d()
 declare void @e()
 declare void @f()
+declare zeroext i1 @g()
 
 !1 = !{!"function_entry_count", i64 1}
 !2 = !{!"branch_weights", i32 100, i32 1}
@@ -119,3 +215,6 @@
 !3 = !{!"branch_weights", i32 1, i32 10}
 !4 = !{!"branch_weights", i32 1000, i32 1}
 !5 = !{!"branch_weights", i32 100, i32 1}
+!6 = !{!"branch_weights", i32 1, i32 2000}
+!7 = !{!"branch_weights", i32 1, i32 100000}
+!8 = !{!"branch_weights", i32 1000000, i32 1}