Index: docs/BranchWeightMetadata.rst =================================================================== --- docs/BranchWeightMetadata.rst +++ docs/BranchWeightMetadata.rst @@ -123,11 +123,11 @@ optimization, ``MD_prof`` nodes can also be assigned to a function definition. The first operand is a string indicating the name of the associated counter. -Currently, one counter is supported: "function_entry_count". This is a 64-bit -counter that indicates the number of times that this function was invoked (in -the case of instrumentation-based profiles). In the case of sampling-based -profiles, this counter is an approximation of how many times the function was -invoked. +Currently, one counter is supported: "function_entry_count". The second operand +is a 64-bit counter that indicates the number of times that this function was +invoked (in the case of instrumentation-based profiles). In the case of +sampling-based profiles, this operand is an approximation of how many times +the function was invoked. For example, in the code below, the instrumentation for function foo() indicates that it was called 2,590 times at runtime. @@ -138,3 +138,10 @@ ret i32 0 } !1 = !{!"function_entry_count", i64 2590} + +If "function_entry_count" has more than 2 operands, the later operands are +the GUIDs of the functions that need to be imported by ThinLTO. They are only +set by sampling-based profiles. The reason that we cannot annotate this on the +callsite is that it can only go down 1 level in the call chain. For the cases +where foo_in_a_cc()->bar_in_b_cc()->baz_in_c_cc(), we will need to go down 2 +levels in the call chain to import both bar_in_b_cc and baz_in_c_cc. 
Index: include/llvm/IR/Function.h =================================================================== --- include/llvm/IR/Function.h +++ include/llvm/IR/Function.h @@ -18,6 +18,7 @@ #ifndef LLVM_IR_FUNCTION_H #define LLVM_IR_FUNCTION_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" #include "llvm/ADT/StringRef.h" @@ -207,8 +208,11 @@ /// \brief Set the entry count for this function. /// /// Entry count is the number of times this function was executed based on - /// pgo data. - void setEntryCount(uint64_t Count); + /// pgo data. \p Imports points to a set of GUIDs that need to be imported + /// by the function for sample PGO, to enable the same inlines as the + /// profiled optimized binary. + void setEntryCount(uint64_t Count, + const DenseSet *Imports = nullptr); /// \brief Get the entry count for this function. /// @@ -216,6 +220,10 @@ /// pgo data. Optional getEntryCount() const; + /// Returns the set of GUIDs that need to be imported to the function for + /// sample PGO, to enable the same inlines as the profiled optimized binary. + DenseSet getImportGUIDs() const; + /// Set the section prefix for this function. void setSectionPrefix(StringRef Prefix); Index: include/llvm/IR/MDBuilder.h =================================================================== --- include/llvm/IR/MDBuilder.h +++ include/llvm/IR/MDBuilder.h @@ -15,7 +15,9 @@ #ifndef LLVM_IR_MDBUILDER_H #define LLVM_IR_MDBUILDER_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/Support/DataTypes.h" #include @@ -63,8 +65,11 @@ /// Return metadata specifying that a branch or switch is unpredictable. MDNode *createUnpredictable(); - /// Return metadata containing the entry count for a function. 
- MDNode *createFunctionEntryCount(uint64_t Count); + /// Return metadata containing the entry \p Count for a function, and the + /// GUIDs stored in \p Imports that need to be imported for sample PGO, to + /// enable the same inlines as the profiled optimized binary. + MDNode *createFunctionEntryCount(uint64_t Count, + const DenseSet *Imports); /// Return metadata containing the section prefix for a function. MDNode *createFunctionSectionPrefix(StringRef Prefix); Index: include/llvm/ProfileData/SampleProf.h =================================================================== --- include/llvm/ProfileData/SampleProf.h +++ include/llvm/ProfileData/SampleProf.h @@ -15,8 +15,11 @@ #ifndef LLVM_PROFILEDATA_SAMPLEPROF_H_ #define LLVM_PROFILEDATA_SAMPLEPROF_H_ +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/raw_ostream.h" @@ -300,6 +303,20 @@ return Result; } + /// Recursively traverses all children, if the corresponding function is + /// not defined in module \p M, and its total sample is greater than + /// \p Threshold, add its corresponding GUID to \p S. + void findImportedFunctions(DenseSet &S, const Module *M, + uint64_t Threshold) const { + if (TotalSamples <= Threshold) + return; + Function *F = M->getFunction(Name); + if (!F || !F->getSubprogram()) + S.insert(Function::getGUID(Name)); + for (auto CS : CallsiteSamples) + CS.second.findImportedFunctions(S, M, Threshold); + } + /// Set the name of the function. 
void setName(StringRef FunctionName) { Name = FunctionName; } Index: lib/Analysis/ModuleSummaryAnalysis.cpp =================================================================== --- lib/Analysis/ModuleSummaryAnalysis.cpp +++ lib/Analysis/ModuleSummaryAnalysis.cpp @@ -183,6 +183,11 @@ } } + // Explicitly add hot edges to enforce importing for designated GUIDs for + // sample PGO, to enable the same inlines as the profiled optimized binary. + for (auto &I : F.getImportGUIDs()) + CallGraphEdges[I].updateHotness(CalleeInfo::HotnessType::Hot); + bool NonRenamableLocal = isNonRenamableLocal(F); bool NotEligibleForImport = NonRenamableLocal || HasInlineAsmMaybeReferencingInternal || Index: lib/IR/Function.cpp =================================================================== --- lib/IR/Function.cpp +++ lib/IR/Function.cpp @@ -1279,9 +1279,10 @@ setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit)); } -void Function::setEntryCount(uint64_t Count) { +void Function::setEntryCount(uint64_t Count, + const DenseSet *S) { MDBuilder MDB(getContext()); - setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count)); + setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count, S)); } Optional Function::getEntryCount() const { @@ -1298,6 +1299,18 @@ return None; } +DenseSet Function::getImportGUIDs() const { + DenseSet R; + if (MDNode *MD = getMetadata(LLVMContext::MD_prof)) + if (MDString *MDS = dyn_cast(MD->getOperand(0))) + if (MDS->getString().equals("function_entry_count")) + for (unsigned i = 2; i < MD->getNumOperands(); i++) + R.insert(mdconst::extract(MD->getOperand(i)) + ->getValue() + .getZExtValue()); + return R; +} + void Function::setSectionPrefix(StringRef Prefix) { MDBuilder MDB(getContext()); setMetadata(LLVMContext::MD_section_prefix, Index: lib/IR/MDBuilder.cpp =================================================================== --- lib/IR/MDBuilder.cpp +++ lib/IR/MDBuilder.cpp @@ -56,11 +56,16 @@ return MDNode::get(Context, None); } 
-MDNode *MDBuilder::createFunctionEntryCount(uint64_t Count) { +MDNode *MDBuilder::createFunctionEntryCount( + uint64_t Count, const DenseSet *Imports) { Type *Int64Ty = Type::getInt64Ty(Context); - return MDNode::get(Context, - {createString("function_entry_count"), - createConstant(ConstantInt::get(Int64Ty, Count))}); + SmallVector Ops; + Ops.push_back(createString("function_entry_count")); + Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count))); + if (Imports) + for (auto ID : *Imports) + Ops.push_back(createConstant(ConstantInt::get(Int64Ty, ID))); + return MDNode::get(Context, Ops); } MDNode *MDBuilder::createFunctionSectionPrefix(StringRef Prefix) { Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -1650,8 +1650,8 @@ for (const auto &Pair : MDs) { if (Pair.first == LLVMContext::MD_prof) { MDNode *MD = Pair.second; - Assert(MD->getNumOperands() == 2, - "!prof annotations should have exactly 2 operands", MD); + Assert(MD->getNumOperands() >= 2, + "!prof annotations should have no less than 2 operands", MD); // Check first operand. Assert(MD->getOperand(0) != nullptr, "first operand should not be null", Index: lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- lib/Transforms/IPO/SampleProfile.cpp +++ lib/Transforms/IPO/SampleProfile.cpp @@ -163,7 +163,8 @@ ErrorOr getBlockWeight(const BasicBlock *BB); const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; const FunctionSamples *findFunctionSamples(const Instruction &I) const; - bool inlineHotFunctions(Function &F); + bool inlineHotFunctions(Function &F, + DenseSet &ImportGUIDs); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); @@ -603,9 +604,12 @@ /// it to direct call. 
Each indirect call is limited with a single target. /// /// \param F function to perform iterative inlining. +/// \param ImportGUIDs a set to be updated to include all GUIDs that come +/// from a different module but are inlined in the profiled binary. /// /// \returns True if there is any inline happened. -bool SampleProfileLoader::inlineHotFunctions(Function &F) { +bool SampleProfileLoader::inlineHotFunctions( + Function &F, DenseSet &ImportGUIDs) { DenseSet PromotedInsns; bool Changed = false; LLVMContext &Ctx = F.getContext(); @@ -654,8 +658,12 @@ continue; } } - if (!CalledFunction || !CalledFunction->getSubprogram()) + if (!CalledFunction || !CalledFunction->getSubprogram()) { + findCalleeFunctionSamples(*I)->findImportedFunctions( + ImportGUIDs, F.getParent(), + Samples->getTotalSamples() * SampleProfileHotThreshold / 100); continue; + } DebugLoc DLoc = I->getDebugLoc(); uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples(); if (InlineFunction(CallSite(DI), IFI)) { @@ -1040,10 +1048,6 @@ bool Changed = true; unsigned I = 0; - // Add an entry count to the function using the samples gathered - // at the function entry. - F.setEntryCount(Samples->getHeadSamples() + 1); - // If BB weight is larger than its corresponding loop's header BB weight, // use the BB weight to replace the loop header BB weight. for (auto &BI : F) { @@ -1272,12 +1276,19 @@ DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() << ": " << getFunctionLoc(F) << "\n"); - Changed |= inlineHotFunctions(F); + DenseSet ImportGUIDs; + Changed |= inlineHotFunctions(F, ImportGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { + // Add an entry count to the function using the samples gathered at the + // function entry. Also sets the GUIDs that come from a different + // module but are inlined in the profiled binary. This is aiming at making + // the IR match the profiled binary before annotation. 
+ F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs); + // Compute dominance and loop info needed for propagation. computeDominanceAndLoopInfo(F); Index: test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll =================================================================== --- test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll +++ test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK-LABEL: ; CHECK-LABEL: ; COMBINED: