Index: include/llvm/IR/ModuleSummaryIndex.h =================================================================== --- include/llvm/IR/ModuleSummaryIndex.h +++ include/llvm/IR/ModuleSummaryIndex.h @@ -445,7 +445,7 @@ FunctionSummary::GVFlags( GlobalValue::LinkageTypes::AvailableExternallyLinkage, /*NotEligibleToImport=*/true, /*Live=*/true, /*IsLocal=*/false), - 0, FunctionSummary::FFlags{}, std::vector(), + 0, FunctionSummary::FFlags{}, 0, std::vector(), std::move(Edges), std::vector(), std::vector(), std::vector(), @@ -465,6 +465,11 @@ /// recurses or aliases. FFlags FunFlags; + /// The synthesized entry count of the function. + /// This is only populated during ThinLink phase and remains unused while + /// generating per-module summaries. + uint64_t EntryCount = 0; + /// List of call edge pairs from this function. std::vector CallGraphEdgeList; @@ -491,14 +496,15 @@ public: FunctionSummary(GVFlags Flags, unsigned NumInsts, FFlags FunFlags, - std::vector Refs, std::vector CGEdges, + uint64_t EntryCount, std::vector Refs, + std::vector CGEdges, std::vector TypeTests, std::vector TypeTestAssumeVCalls, std::vector TypeCheckedLoadVCalls, std::vector TypeTestAssumeConstVCalls, std::vector TypeCheckedLoadConstVCalls) : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)), - InstCount(NumInsts), FunFlags(FunFlags), + InstCount(NumInsts), FunFlags(FunFlags), EntryCount(EntryCount), CallGraphEdgeList(std::move(CGEdges)) { if (!TypeTests.empty() || !TypeTestAssumeVCalls.empty() || !TypeCheckedLoadVCalls.empty() || !TypeTestAssumeConstVCalls.empty() || @@ -521,6 +527,12 @@ /// Get the instruction count recorded for this function. unsigned instCount() const { return InstCount; } + /// Get the synthetic entry count for this function. + uint64_t entryCount() const { return EntryCount; } + + /// Set the synthetic entry count for this function. + void setEntryCount(uint64_t EC) { EntryCount = EC; } + /// Return the list of pairs. ArrayRef calls() const { return CallGraphEdgeList; } @@ -1058,6 +1070,7 @@ /// GraphTraits definition to build SCC for the index template <> struct GraphTraits { typedef ValueInfo NodeRef; + using EdgeRef = FunctionSummary::EdgeTy &; static NodeRef valueInfoFromEdge(FunctionSummary::EdgeTy &P) { return P.first; @@ -1066,6 +1079,8 @@ mapped_iterator::iterator, decltype(&valueInfoFromEdge)>; + using ChildEdgeIteratorType = std::vector::iterator; + static NodeRef getEntryNode(ValueInfo V) { return V; } static ChildIteratorType child_begin(NodeRef N) { @@ -1087,6 +1102,26 @@ cast(N.getSummaryList().front()->getBaseObject()); return ChildIteratorType(F->CallGraphEdgeList.end(), &valueInfoFromEdge); } + + static ChildEdgeIteratorType child_edge_begin(NodeRef N) { + if (!N.getSummaryList().size()) // handle external function + return FunctionSummary::ExternalNode.CallGraphEdgeList.begin(); + + FunctionSummary *F = + cast(N.getSummaryList().front()->getBaseObject()); + return F->CallGraphEdgeList.begin(); + } + + static ChildEdgeIteratorType child_edge_end(NodeRef N) { + if (!N.getSummaryList().size()) // handle external function + return FunctionSummary::ExternalNode.CallGraphEdgeList.end(); + + FunctionSummary *F = + cast(N.getSummaryList().front()->getBaseObject()); + return F->CallGraphEdgeList.end(); + } + + static NodeRef edge_dest(EdgeRef E) { return E.first; } }; template <> Index: include/llvm/IR/ModuleSummaryIndexYAML.h =================================================================== --- include/llvm/IR/ModuleSummaryIndexYAML.h +++ include/llvm/IR/ModuleSummaryIndexYAML.h @@ -216,9 +216,9 @@ GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), - 0, FunctionSummary::FFlags{}, ArrayRef{}, - ArrayRef{}, std::move(FSum.TypeTests), - std::move(FSum.TypeTestAssumeVCalls), + /*NumInsts=*/0, FunctionSummary::FFlags{}, /*EntryCount=*/0, + ArrayRef{}, ArrayRef{}, + std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), std::move(FSum.TypeCheckedLoadVCalls), std::move(FSum.TypeTestAssumeConstVCalls), std::move(FSum.TypeCheckedLoadConstVCalls))); Index: include/llvm/Transforms/IPO/FunctionImport.h =================================================================== --- include/llvm/Transforms/IPO/FunctionImport.h +++ include/llvm/Transforms/IPO/FunctionImport.h @@ -126,6 +126,9 @@ /// it is an alias. Returns true if converted, false if replaced. bool convertToDeclaration(GlobalValue &GV); +/// Compute synthetic function entry counts. +void computeSyntheticCounts(ModuleSummaryIndex &Index); + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. // Index: lib/Analysis/ModuleSummaryAnalysis.cpp =================================================================== --- lib/Analysis/ModuleSummaryAnalysis.cpp +++ lib/Analysis/ModuleSummaryAnalysis.cpp @@ -354,7 +354,7 @@ F.returnDoesNotAlias(), }; auto FuncSummary = llvm::make_unique( - Flags, NumInsts, FunFlags, RefEdges.takeVector(), + Flags, NumInsts, FunFlags, /*EntryCount=*/0, RefEdges.takeVector(), CallGraphEdges.takeVector(), TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(), TypeTestAssumeConstVCalls.takeVector(), @@ -466,7 +466,8 @@ F->hasFnAttribute(Attribute::ReadOnly), F->hasFnAttribute(Attribute::NoRecurse), F->returnDoesNotAlias()}, - ArrayRef{}, ArrayRef{}, + 0, ArrayRef{}, + ArrayRef{}, ArrayRef{}, ArrayRef{}, ArrayRef{}, Index: lib/Analysis/SyntheticCountsUtils.cpp =================================================================== --- lib/Analysis/SyntheticCountsUtils.cpp +++ lib/Analysis/SyntheticCountsUtils.cpp @@ -14,12 +14,12 @@ #include "llvm/Analysis/SyntheticCountsUtils.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/ModuleSummaryIndex.h" using namespace llvm; @@ -29,7 +29,7 @@ const SccTy &SCC, GetRelBBFreqTy GetRelBBFreq, GetCountTy GetCount, AddCountTy AddCount) { - SmallPtrSet SCCNodes; + DenseSet SCCNodes; SmallVector, 8> SCCEdges, NonSCCEdges; for (auto &Node : SCC) @@ -111,3 +111,4 @@ } template class llvm::SyntheticCountsUtils; +template class llvm::SyntheticCountsUtils; Index: lib/Bitcode/Reader/BitcodeReader.cpp =================================================================== --- lib/Bitcode/Reader/BitcodeReader.cpp +++ lib/Bitcode/Reader/BitcodeReader.cpp @@ -5152,9 +5152,9 @@ } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (Version < 1 || Version > 4) + if (Version < 1 || Version > 5) return error("Invalid summary version " + Twine(Version) + - ", 1, 2, 3 or 4 expected"); + ". Version should be in the range [1-5]."); Record.clear(); // Keep around the last seen summary to be used when we see an optional @@ -5257,8 +5257,8 @@ ArrayRef(Record).slice(CallGraphEdgeStartIndex), IsOldProfileFormat, HasProfile, HasRelBF); auto FS = llvm::make_unique( - Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs), - std::move(Calls), std::move(PendingTypeTests), + Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0, + std::move(Refs), std::move(Calls), std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls), std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), @@ -5329,13 +5329,18 @@ uint64_t RawFlags = Record[2]; unsigned InstCount = Record[3]; uint64_t RawFunFlags = 0; + uint64_t EntryCount = 0; unsigned NumRefs = Record[4]; int RefListStartIndex = 5; if (Version >= 4) { RawFunFlags = Record[4]; - NumRefs = Record[5]; RefListStartIndex = 6; + if (Version >= 5) { + EntryCount = Record[5]; + RefListStartIndex = 7; + } + NumRefs = Record[RefListStartIndex - 1]; } auto Flags = getDecodedGVSummaryFlags(RawFlags, Version); @@ -5350,8 +5355,8 @@ IsOldProfileFormat, HasProfile, false); ValueInfo VI = getValueInfoFromValueId(ValueID).first; auto FS = llvm::make_unique( - Flags, InstCount, getDecodedFFlags(RawFunFlags), std::move(Refs), - std::move(Edges), std::move(PendingTypeTests), + Flags, InstCount, getDecodedFFlags(RawFunFlags), EntryCount, + std::move(Refs), std::move(Edges), std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls), std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), Index: lib/Bitcode/Writer/BitcodeWriter.cpp =================================================================== --- lib/Bitcode/Writer/BitcodeWriter.cpp +++ lib/Bitcode/Writer/BitcodeWriter.cpp @@ -3494,7 +3494,7 @@ // Current version for the summary. // This is bumped whenever we introduce changes in the way some record are // interpreted, like flags for instance. -static const uint64_t INDEX_VERSION = 4; +static const uint64_t INDEX_VERSION = 5; /// Emit the per-module summary section alongside the rest of /// the module's bitcode. @@ -3638,6 +3638,7 @@ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3744,6 +3745,8 @@ NameVals.push_back(getEncodedGVSummaryFlags(FS->flags())); NameVals.push_back(FS->instCount()); NameVals.push_back(getEncodedFFlags(FS->fflags())); + NameVals.push_back(FS->entryCount()); + // Fill in below NameVals.push_back(0); @@ -3755,7 +3758,7 @@ NameVals.push_back(*RefValueId); Count++; } - NameVals[5] = Count; + NameVals[6] = Count; bool HasProfileData = false; for (auto &EI : FS->calls()) { Index: lib/LTO/LTO.cpp =================================================================== --- lib/LTO/LTO.cpp +++ lib/LTO/LTO.cpp @@ -1127,6 +1127,8 @@ if (!ModuleToDefinedGVSummaries.count(Mod.first)) ModuleToDefinedGVSummaries.try_emplace(Mod.first); + computeSyntheticCounts(ThinLTO.CombinedIndex); + StringMap ImportLists( ThinLTO.ModuleMap.size()); StringMap ExportLists( Index: lib/Transforms/IPO/FunctionImport.cpp =================================================================== --- lib/Transforms/IPO/FunctionImport.cpp +++ lib/Transforms/IPO/FunctionImport.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/SyntheticCountsUtils.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/Constants.h" @@ -120,6 +121,11 @@ ), cl::Hidden, cl::desc("Enable import metadata like 'thinlto_src_module'")); +static cl::opt EnableImportEntryCount( + "enable-import-entry-count", cl::init(false), cl::Hidden, + cl::desc("Attach 'synthetic_entry_count' metadata after " + "importing")); + /// Summary file to use for function importing when using -function-import from /// the command line. static cl::opt @@ -132,6 +138,8 @@ ImportAllIndex("import-all-index", cl::desc("Import all external functions in index.")); +extern cl::opt InitialSyntheticCount; + // Load lazily a module from \p FileName in \p Context. static std::unique_ptr loadFile(const std::string &FileName, LLVMContext &Context) { @@ -673,6 +681,57 @@ NumLiveSymbols += LiveSymbols; } +static void initializeCounts(ModuleSummaryIndex &Index) { + auto Root = Index.calculateCallGraphRoot(); + // Root is a fake node. All successors of are the actual roots of the + // callgraph. + // FIXME: This initializes the entry counts of only the root nodes. This makes + // sense when compiling a binary with ThinLTO, but for libraries any of the + // non-root nodes could be called from outside. + for (auto &C : Root.calls()) { + auto &V = C.first; + for (auto &GVS : V.getSummaryList()) { + auto S = GVS.get()->getBaseObject(); + auto *F = cast(S); + F->setEntryCount(InitialSyntheticCount); + } + } +} + +void llvm::computeSyntheticCounts(ModuleSummaryIndex &Index) { + using Scaled64 = ScaledNumber; + initializeCounts(Index); + auto GetCallSiteRelFreq = [](FunctionSummary::EdgeTy &Edge) { + // FIXME: Handle fake/invalid edges? + return Scaled64(Edge.second.RelBlockFreq, -CalleeInfo::ScaleShift); + }; + auto GetEntryCount = [](ValueInfo V) { + if (V.getSummaryList().size()) { + auto S = V.getSummaryList().front().get()->getBaseObject(); + auto *F = cast(S); + return F->entryCount(); + } else { + return UINT64_C(0); + } + }; + auto AddToEntryCount = [](ValueInfo V, uint64_t New) { + if (!V.getSummaryList().size()) + return; + for (auto &GVS : V.getSummaryList()) { + auto S = GVS.get()->getBaseObject(); + auto *F = cast(S); + F->setEntryCount(SaturatingAdd(F->entryCount(), New)); + } + }; + + // After initializing the counts in initializeCounts above, the counts have to + // be propagated across the combined callgraph. + // SyntheticCountsUtils::propagate takes of this propagation on any callgraph + // that specialized GraphTraits. + SyntheticCountsUtils::propagate( + &Index, GetCallSiteRelFreq, GetEntryCount, AddToEntryCount); +} + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. void llvm::gatherImportedSummariesForModule( @@ -870,6 +929,14 @@ unsigned ImportedCount = 0, ImportedGVCount = 0; IRMover Mover(DestModule); + for (auto &F : DestModule) { + FunctionSummary *FS = + dyn_cast_or_null(Index.findSummaryInModule( + F.getGUID(), DestModule.getModuleIdentifier())); + if (FS && !F.isDeclaration() && EnableImportEntryCount) + F.setEntryCount( + Function::ProfileCount(FS->entryCount(), Function::PCT_Synthetic)); + } // Do the actual import of functions now, one Module at a time std::set ModuleNameOrderedList; for (auto &FunctionsToImportPerModule : ImportList) { @@ -905,6 +972,13 @@ if (Import) { if (Error Err = F.materialize()) return std::move(Err); + auto *Summary = Index.findSummaryInModule(GUID, Name); + FunctionSummary *FS = + dyn_cast(Summary->getBaseObject()); + if (FS && !F.isDeclaration() && EnableImportEntryCount) { + F.setEntryCount(Function::ProfileCount(FS->entryCount(), + Function::PCT_Synthetic)); + } if (EnableImportMetadata) { // Add 'thinlto_src_module' metadata for statistics and debugging. F.setMetadata( Index: lib/Transforms/IPO/SyntheticCountsPropagation.cpp =================================================================== --- lib/Transforms/IPO/SyntheticCountsPropagation.cpp +++ lib/Transforms/IPO/SyntheticCountsPropagation.cpp @@ -46,7 +46,7 @@ #define DEBUG_TYPE "synthetic-counts-propagation" /// Initial synthetic count assigned to functions. -static cl::opt +cl::opt InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10), cl::ZeroOrMore, cl::desc("Initial value of synthetic entry count.")); Index: test/Bitcode/summary_version.ll =================================================================== --- test/Bitcode/summary_version.ll +++ test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: +; CHECK: Index: test/Bitcode/thinlto-alias.ll =================================================================== --- test/Bitcode/thinlto-alias.ll +++ test/Bitcode/thinlto-alias.ll @@ -27,7 +27,7 @@ ; COMBINED-NEXT: ; COMBINED-NEXT: -; COMBINED-NEXT: +; COMBINED-NEXT: ; COMBINED-NEXT: +; COMBINED-NEXT: ; COMBINED-NEXT: ; ModuleID = 'thinlto-function-summary-callgraph.ll' Index: test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll =================================================================== --- test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll +++ test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll @@ -52,7 +52,7 @@ ; COMBINED-NEXT: +; COMBINED-NEXT: ; COMBINED_NEXT: Index: test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll =================================================================== --- test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll +++ test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll @@ -58,7 +58,7 @@ ; COMBINED-NEXT: +; COMBINED-NEXT: ; COMBINED_NEXT: Index: test/Bitcode/thinlto-function-summary-callgraph.ll =================================================================== --- test/Bitcode/thinlto-function-summary-callgraph.ll +++ test/Bitcode/thinlto-function-summary-callgraph.ll @@ -33,7 +33,7 @@ ; COMBINED-NEXT: +; COMBINED-NEXT: ; COMBINED-NEXT: ; ModuleID = 'thinlto-function-summary-callgraph.ll' Index: test/ThinLTO/X86/Inputs/function_entry_count.ll =================================================================== --- /dev/null +++ test/ThinLTO/X86/Inputs/function_entry_count.ll @@ -0,0 +1,9 @@ +target triple = "x86_64-unknown-linux-gnu" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare void @h(); + +define void @g() { + call void @h(); + ret void +} Index: test/ThinLTO/X86/function_entry_count.ll =================================================================== --- /dev/null +++ test/ThinLTO/X86/function_entry_count.ll @@ -0,0 +1,42 @@ +; RUN: opt -thinlto-bc %s -write-relbf-to-summary -thin-link-bitcode-file=%t1.thinlink.bc -o %t1.bc +; RUN: opt -thinlto-bc %p/Inputs/function_entry_count.ll -write-relbf-to-summary -thin-link-bitcode-file=%t2.thinlink.bc -o %t2.bc + +; First perform the thin link on the normal bitcode file. +; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps -enable-import-entry-count \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t1.bc,g, \ +; RUN: -r=%t1.bc,f,px \ +; RUN: -r=%t1.bc,h,px \ +; RUN: -r=%t2.bc,h, \ +; RUN: -r=%t2.bc,g,px +; RUN: opt -function-import -import-all-index -enable-import-metadata -summary-file %t1.bc.thinlto.bc %t1.bc -o %t1.out -enable-import-entry-count +; RUN: llvm-dis -o - %t1.out | FileCheck %s + +; CHECK: define void @h() !prof ![[PROF2:[0-9]+]] +; CHECK: define void @f(i32 %n) !prof ![[PROF1:[0-9]+]] +; CHECK: define available_externally void @g() !prof ![[PROF2]] +; CHECK-DAG: ![[PROF1]] = !{!"synthetic_function_entry_count", i64 10} +; CHECK-DAG: ![[PROF2]] = !{!"synthetic_function_entry_count", i64 198} + +target triple = "x86_64-unknown-linux-gnu" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare void @g(); + +define void @h() { + ret void +} + +define void @f(i32 %n) { +entry: + %cmp = icmp slt i32 %n, 1 + br i1 %cmp, label %exit, label %loop +loop: + %n1 = phi i32 [%n, %entry], [%n2, %loop] + call void @g() + %n2 = sub i32 %n1, 1 + %cmp2 = icmp slt i32 %n, 1 + br i1 %cmp2, label %exit, label %loop +exit: + ret void +}