Index: llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp +++ llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp @@ -1411,9 +1411,16 @@ continue; auto *OrigBB = cast(Entry.first); auto *ClonedBB = cast(Entry.second); - ClonedBBs.insert(ClonedBB); - CallerBFI->setBlockFreq(ClonedBB, - CalleeBFI->getBlockFreq(OrigBB).getFrequency()); + uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency(); + if (!ClonedBBs.insert(ClonedBB).second) { + // Multiple blocks in the callee might get mapped to one cloned block in + // the caller since we prune the callee as we clone it. When that happens, + // we want to use the maximum among the original blocks' frequencies. + uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency(); + if (NewFreq > Freq) + Freq = NewFreq; + } + CallerBFI->setBlockFreq(ClonedBB, Freq); } BasicBlock *EntryClone = cast(VMap.lookup(&CalleeEntryBlock)); CallerBFI->setBlockFreqAndScale( Index: llvm/trunk/test/Transforms/Inline/bfi-update.ll =================================================================== --- llvm/trunk/test/Transforms/Inline/bfi-update.ll +++ llvm/trunk/test/Transforms/Inline/bfi-update.ll @@ -0,0 +1,93 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S -inline-threshold=50 -inline-cold-callsite-threshold=0 -hot-callsite-threshold=50 | FileCheck %s +; This tests incremental updates to caller's BFI as a callee gets inlined. +; In bottom-up inlining, first c->e inlining is considered and fails because +; e's size exceeds the threshold of 50. Then a->c inlining is considered and it +; succeeds. a's BFI is updated incrementally. As c's blocks get pruned, the +; block with label cond_false is removed and since the remanining code is +; straight-line a single block gets cloned into a. This block should get the +; maximum block frequency among the original blocks in c. If it gets the +; frequency of the block with label cond_true in @c, its frequency will be +; 1/10th of function a's entry block frequency, resulting in a callsite count of +; 2 (since a's entry count is 20) which means that a->e callsite will be +; considered cold and not inlined. + +@data = external global i32 +; CHECK-LABEL: define i32 @a( +define i32 @a(i32 %a1) !prof !21 { +; CHECK-NOT: call i32 @c +; CHECK-NOT: call i32 @e +; CHECK: ret +entry: + %cond = icmp sle i32 %a1, 1 + %a2 = call i32 @c(i32 1) + br label %exit +exit: + ret i32 %a2 +} + +declare void @ext(); + +; CHECK: @c(i32 %c1) !prof [[COUNT1:![0-9]+]] +define i32 @c(i32 %c1) !prof !23 { + call void @ext() + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false, !prof !25 + +cond_false: + br label %exit + +cond_true: + %c11 = call i32 @e(i32 %c1) + br label %exit +exit: + %c12 = phi i32 [ 0, %cond_false], [ %c11, %cond_true ] + ret i32 %c12 +} + + +; CHECK: @e(i32 %c1) !prof [[COUNT2:![0-9]+]] +define i32 @e(i32 %c1) !prof !24 { + call void @ext() + call void @ext() + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + call void @ext() + %c2 = load i32, i32* @data, align 4 + %c3 = add i32 %c1, %c2 + %c4 = mul i32 %c3, %c2 + %c5 = add i32 %c4, %c2 + %c6 = mul i32 %c5, %c2 + %c7 = add i32 %c6, %c2 + %c8 = mul i32 %c7, %c2 + %c9 = add i32 %c8, %c2 + %c10 = mul i32 %c9, %c2 + ret i32 %c10 + +cond_true: + ret i32 0 +} + +; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 480} +; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 80} +!21 = !{!"function_entry_count", i64 20} +!23 = !{!"function_entry_count", i64 500} +!24 = !{!"function_entry_count", i64 100} +!25 = !{!"branch_weights", i32 1, i32 9} + +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 1000, i32 1} +!13 = !{i32 999000, i64 1000, i32 1} +!14 = !{i32 999999, i64 5, i32 2}