diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -177,9 +177,11 @@ *GetAssumptionCache = nullptr, ProfileSummaryInfo *PSI = nullptr, BlockFrequencyInfo *CallerBFI = nullptr, - BlockFrequencyInfo *CalleeBFI = nullptr) + BlockFrequencyInfo *CalleeBFI = nullptr, + bool UpdateProfile = true) : CG(cg), GetAssumptionCache(GetAssumptionCache), PSI(PSI), - CallerBFI(CallerBFI), CalleeBFI(CalleeBFI) {} + CallerBFI(CallerBFI), CalleeBFI(CalleeBFI), + UpdateProfile(UpdateProfile) {} /// If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. @@ -203,6 +205,10 @@ /// `InlinedCalls` above is used. SmallVector InlinedCallSites; + /// Update profile for callee as well as cloned version. We need to do this + /// for regular inlining, but not for inlining from sample profile loader. + bool UpdateProfile; + void reset() { StaticAllocas.clear(); InlinedCalls.clear(); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1533,6 +1533,11 @@ auto PrevCount = getEntryCount(); assert(!PrevCount.hasValue() || PrevCount.getType() == Count.getType()); #endif + + auto ImportGUIDs = getImportGUIDs(); + if (S == nullptr && ImportGUIDs.size()) + S = &ImportGUIDs; + MDBuilder MDB(getContext()); setMetadata( LLVMContext::MD_prof, diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -873,7 +873,7 @@ << "incompatible inlining"); return false; } - InlineFunctionInfo IFI(nullptr, &GetAC); + InlineFunctionInfo IFI(nullptr, &GetAC, nullptr, nullptr, nullptr, false); if (InlineFunction(CS, IFI)) { // The call to InlineFunction erases I, so we can't pass it here. 
ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1508,8 +1508,6 @@ else newEntryCount = priorEntryCount + entryDelta; - Callee->setEntryCount(newEntryCount); - // During inlining ? if (VMap) { uint64_t cloneEntryCount = priorEntryCount - newEntryCount; @@ -1518,12 +1516,17 @@ if (auto *CI = dyn_cast_or_null(Entry.second)) CI->updateProfWeight(cloneEntryCount, priorEntryCount); } - for (BasicBlock &BB : *Callee) - // No need to update the callsite if it is pruned during inlining. - if (!VMap || VMap->count(&BB)) - for (Instruction &I : BB) - if (CallInst *CI = dyn_cast(&I)) - CI->updateProfWeight(newEntryCount, priorEntryCount); + + if (entryDelta) { + Callee->setEntryCount(newEntryCount); + + for (BasicBlock &BB : *Callee) + // No need to update the callsite if it is pruned during inlining. + if (!VMap || VMap->count(&BB)) + for (Instruction &I : BB) + if (CallInst *CI = dyn_cast(&I)) + CI->updateProfWeight(newEntryCount, priorEntryCount); + } } /// This function inlines the called function into the basic block of the @@ -1716,13 +1719,15 @@ // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; - if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr) + if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr && + IFI.UpdateProfile) // Update the BFI of blocks cloned into the caller. updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI, CalledFunc->front()); - updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, - IFI.PSI, IFI.CallerBFI); + if (IFI.UpdateProfile) + updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, + IFI.PSI, IFI.CallerBFI); // Inject byval arguments initialization. 
for (std::pair &Init : ByValInit) diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof @@ -0,0 +1,10 @@ +test_sample_loader_inline:63067:0 + 1: sample_loader_inlinee:40000 +sample_loader_inlinee:3000:0 + 1: direct_leaf_func:35000 + 11: 3000 +test_cgscc_inline:63067:0 + 1: sample_loader_inlinee:1 +cgscc_inlinee:3000:0 + 1: direct_leaf_func:35000 + 11: 3000 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/inline-callee-update.ll b/llvm/test/Transforms/SampleProfile/inline-callee-update.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/inline-callee-update.ll @@ -0,0 +1,71 @@ +; facebook T56498894: Make sure the Import GUID list for ThinLTO is properly maintained while updating a function's entry count during inlining + +; RUN: opt < %s -passes='thinlto-pre-link' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/inline-callee-update.prof -S | FileCheck %s + +define i32* @sample_loader_inlinee() !dbg !6{ + %1 = call i32* @direct_leaf_func(i32* null), !dbg !7 + %cmp = icmp ne i32* %1, null + br i1 %cmp, label %then, label %else + +then: + %2 = load i32* ()*, i32* ()** @z, align 8, !dbg !8 + %3 = call i32* %2(), !dbg !8 + ret i32* %3 + +else: + ret i32* null +} + +define i32* @cgscc_inlinee() !dbg !12{ + %1 = call i32* @direct_leaf_func(i32* null), !dbg !13 + %cmp = icmp ne i32* %1, null + br i1 %cmp, label %then, label %else + +then: + %2 = load i32* ()*, i32* ()** @y, align 8, !dbg !14 + %3 = call i32* %2(), !dbg !14 + ret i32* %3 + +else: + ret i32* null +} + +define i32* @test_sample_loader_inline(void ()*) !dbg !3 { + %2 = call i32* @sample_loader_inlinee(), !dbg !4 + ret i32* %2 +} + +define i32* @test_cgscc_inline(void ()*) !dbg !9 { + %2 = call i32* @cgscc_inlinee(), !dbg !10 + ret i32* %2 +} + +@y = 
global i32* ()* null, align 8 +@z = global i32* ()* null, align 8 + +declare i32* @direct_leaf_func(i32* %x) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1) +!1 = !DIFile(filename: "test.cc", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "test_sample_loader_inline", scope: !1, file: !1, line: 3, unit: !0) +!4 = !DILocation(line: 4, scope: !3) +!5 = !DILocation(line: 6, scope: !3) +!6 = distinct !DISubprogram(name: "sample_loader_inlinee", scope: !1, file: !1, line: 11, unit: !0) +!7 = !DILocation(line: 12, scope: !6) +!8 = !DILocation(line: 13, scope: !6) +!9 = distinct !DISubprogram(name: "test_cgscc_inline", scope: !1, file: !1, line: 20, unit: !0) +!10 = !DILocation(line: 21, scope: !9) +!11 = !DILocation(line: 23, scope: !9) +!12 = distinct !DISubprogram(name: "cgscc_inlinee", scope: !1, file: !1, line: 31, unit: !0) +!13 = !DILocation(line: 32, scope: !12) +!14 = !DILocation(line: 33, scope: !12) + +; Make sure the ImportGUID stays with entry count metadata for ThinLTO-PreLink +; CHECK: distinct !DISubprogram(name: "sample_loader_inlinee" +; CHECK-NEXT: {!"function_entry_count", i64 1, i64 -9171813444624716006} +; CHECK: distinct !DISubprogram(name: "cgscc_inlinee" +; CHECK-NEXT: !{!"function_entry_count", i64 0, i64 -9171813444624716006}