diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h --- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h +++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Instructions.h" #include "llvm/ProfileData/SampleProf.h" @@ -90,6 +91,8 @@ // calling context and the context is identified by path from root to the node. class SampleContextTracker { public: + using ContextSamplesTy = SmallSet; + SampleContextTracker(StringMap &Profiles); // Query context profile for a specific callee with given name at a given // call-site. The full context is identified by location of call instruction. @@ -103,6 +106,9 @@ FunctionSamples *getContextSamplesFor(const DILocation *DIL); // Query context profile for a given sample contxt of a function. FunctionSamples *getContextSamplesFor(const SampleContext &Context); + // Get all context profile for given function. + ContextSamplesTy &getAllContextSamplesFor(const Function &Func); + ContextSamplesTy &getAllContextSamplesFor(StringRef Name); // Query base profile for a given function. A base profile is a merged view // of all context profiles for contexts that are not inlined. FunctionSamples *getBaseSamplesFor(const Function &Func, @@ -113,6 +119,9 @@ // This makes sure that inlined context profile will be excluded in // function's base profile. void markContextSamplesInlined(const FunctionSamples *InlinedSamples); + void promoteMergeContextSamplesTree(const Instruction &Inst, + StringRef CalleeName); + void addCallGraphEdges(CallGraph &CG, StringMap &SymbolMap); // Dump the internal context profile trie. 
void dump(); @@ -126,8 +135,6 @@ ContextTrieNode *getTopLevelContextNode(StringRef FName); ContextTrieNode &addTopLevelContextNode(StringRef FName); ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo); - void promoteMergeContextSamplesTree(const Instruction &Inst, - StringRef CalleeName); void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode, StringRef ContextStrToRemove); ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &FromNode, @@ -135,7 +142,7 @@ StringRef ContextStrToRemove); // Map from function name to context profiles (excluding base profile) - StringMap> FuncToCtxtProfileSet; + StringMap FuncToCtxtProfileSet; // Root node for context trie tree ContextTrieNode RootContext; diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -263,6 +263,17 @@ return Node->getFunctionSamples(); } +SampleContextTracker::ContextSamplesTy & +SampleContextTracker::getAllContextSamplesFor(const Function &Func) { + StringRef CanonName = FunctionSamples::getCanonicalFnName(Func); + return FuncToCtxtProfileSet[CanonName]; +} + +SampleContextTracker::ContextSamplesTy & +SampleContextTracker::getAllContextSamplesFor(StringRef Name) { + return FuncToCtxtProfileSet[Name]; +} + FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func, bool MergeContext) { StringRef CanonName = FunctionSamples::getCanonicalFnName(Func); @@ -550,4 +561,25 @@ return *ToNode; } +// Replace call graph edges with dynamic call edges from the profile. +void SampleContextTracker::addCallGraphEdges(CallGraph &CG, + StringMap &SymbolMap) { + // Add profile call edges to the call graph. 
+ std::queue NodeQueue; + NodeQueue.push(&RootContext); + while (!NodeQueue.empty()) { + ContextTrieNode *Node = NodeQueue.front(); + NodeQueue.pop(); + Function *F = SymbolMap.lookup(Node->getFuncName()); + for (auto &I : Node->getAllChildContext()) { + ContextTrieNode *ChildNode = &I.second; + NodeQueue.push(ChildNode); + if (F && !F->isDeclaration()) { + Function *Callee = SymbolMap.lookup(ChildNode->getFuncName()); + if (Callee && !Callee->isDeclaration()) + CG[F]->addCalledFunction(nullptr, CG[Callee]); + } + } + } +} } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -177,6 +177,16 @@ "order of call graph during sample profile loading. It only " "works for new pass manager. ")); +static cl::opt UseProfileIndirectCallEdges( + "use-profile-indirect-call-edges", cl::init(true), cl::Hidden, + cl::desc("Considering indirect call samples from profile when top-down " + "processing functions. Only CSSPGO is supported.")); + +static cl::opt UseProfileTopDownOrder( + "use-profile-top-down-order", cl::init(false), cl::Hidden, + cl::desc("Process functions in one SCC in a top-down order " + "based on the input profile.")); + static cl::opt ProfileSizeInline( "sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " @@ -532,6 +542,8 @@ const SmallVectorImpl &Candidates, const Function &F, bool Hot); std::vector buildFunctionOrder(Module &M, CallGraph *CG); + void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples); + void replaceCallGraphEdges(CallGraph &CG, StringMap &SymbolMap); void generateMDProfMetadata(Function &F); /// Map from function name to Function *. 
Used to find the function from @@ -2341,6 +2353,45 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) +// Add inlined profile call edges to the call graph. +void SampleProfileLoader::addCallGraphEdges(CallGraph &CG, + const FunctionSamples &Samples) { + Function *Caller = SymbolMap.lookup(Samples.getFuncName()); + if (!Caller || Caller->isDeclaration()) + return; + + // Skip non-inlined call edges which are not important since top down inlining + // for non-CS profile is to get more precise profile matching, not to enable + // more inlining. + + for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { + for (const auto &InlinedSamples : CallsiteSamples.second) { + Function *Callee = SymbolMap.lookup(InlinedSamples.first); + if (Callee && !Callee->isDeclaration()) + CG[Caller]->addCalledFunction(nullptr, CG[Callee]); + addCallGraphEdges(CG, InlinedSamples.second); + } + } +} + +// Replace call graph edges with dynamic call edges from the profile. +void SampleProfileLoader::replaceCallGraphEdges( + CallGraph &CG, StringMap &SymbolMap) { + // Remove static call edges from the call graph except for the ones from the + // root which make the call graph connected. + for (const auto &Node : CG) + if (Node.second.get() != CG.getExternalCallingNode()) + Node.second->removeAllCalledFunctions(); + + // Add profile call edges to the call graph. + if (ProfileIsCS) { + ContextTracker->addCallGraphEdges(CG, SymbolMap); + } else { + for (const auto &Samples : Reader->getProfiles()) + addCallGraphEdges(CG, Samples.second); + } +} + std::vector SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { std::vector FunctionOrderList; @@ -2363,16 +2414,96 @@ } assert(&CG->getModule() == &M); + + // Add indirect call edges from profile to augment the static call graph. + // Functions will be processed in a top-down order defined by the static call + // graph. 
Adjusting the order by considering indirect call edges from the + // profile (which don't exist in the static call graph) can enable the + // inlining of indirect call targets by processing the caller before them. + // TODO: enable this for non-CS profile and fix the counts returning logic to + // have a full support for indirect calls. + if (UseProfileIndirectCallEdges && ProfileIsCS) { + for (auto &Entry : *CG) { + const auto *F = Entry.first; + if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile")) + continue; + auto &AllContexts = ContextTracker->getAllContextSamplesFor(*F); + if (AllContexts.empty()) + continue; + + for (const auto &BB : *F) { + for (const auto &I : BB.getInstList()) { + const auto *CB = dyn_cast(&I); + if (!CB || !CB->isIndirectCall()) + continue; + const DebugLoc &DLoc = I.getDebugLoc(); + if (!DLoc) + continue; + auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc); + for (FunctionSamples *Samples : AllContexts) { + if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) { + for (const auto &Target : CallTargets.get()) { + Function *Callee = SymbolMap.lookup(Target.first()); + if (Callee && !Callee->isDeclaration()) + Entry.second->addCalledFunction(nullptr, (*CG)[Callee]); + } + } + } + } + } + } + } + + // Compute a top-down order from the profile, which is used to sort functions + // in one SCC later. The static processing order computed for an SCC may not + // reflect the call contexts in the context-sensitive profile, thus may cause + // potential inlining to be overlooked. The function order in one SCC is being + // adjusted to a top-down order based on the profile to favor more inlining. + DenseMap ProfileOrderMap; + if (UseProfileTopDownOrder || + (ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) { + // Create a static call graph. The call edges are not important since they + // will be replaced by dynamic edges from the profile. 
+ CallGraph ProfileCG(M); + replaceCallGraphEdges(ProfileCG, SymbolMap); + scc_iterator CGI = scc_begin(&ProfileCG); + uint64_t I = 0; + while (!CGI.isAtEnd()) { + for (CallGraphNode *Node : *CGI) { + if (auto *F = Node->getFunction()) + ProfileOrderMap[F] = ++I; + } + ++CGI; + } + } + scc_iterator CGI = scc_begin(CG); while (!CGI.isAtEnd()) { - for (CallGraphNode *node : *CGI) { - auto F = node->getFunction(); + uint64_t Start = FunctionOrderList.size(); + for (CallGraphNode *Node : *CGI) { + auto *F = Node->getFunction(); if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) FunctionOrderList.push_back(F); } + + // Sort nodes in SCC based on the profile top-down order. + if (!ProfileOrderMap.empty()) { + llvm::sort(FunctionOrderList.begin() + Start, FunctionOrderList.end(), + [&ProfileOrderMap](Function *Left, Function *Right) { + return ProfileOrderMap[Left] < ProfileOrderMap[Right]; + }); + } + ++CGI; } + LLVM_DEBUG({ + dbgs() << "Function processing order:\n"; + for (auto F : reverse(FunctionOrderList)) { + dbgs() << F->getName() << "\n"; + } + }); + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); return FunctionOrderList; } @@ -2525,6 +2656,7 @@ } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { + LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n"); DILocation2SampleMap.clear(); // By default the entry count is initialized to -1, which will be treated // conservatively by getEntryCount as the same as unknown (None). 
This is diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof @@ -0,0 +1,38 @@ +[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11 + 0: 6 + 1: 6 + 3: 287884 + 15: 23 +[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20 + 0: 15 + 1: 15 + 3: 74946 + 10: 23324 + 15: 11 +[main]:154:0 + 2: 12 + 3: 18 _Z5funcAi:11 + 3.1: 18 _Z5funcBi:19 +[external:12 @ main]:154:12 + 2: 12 + 3: 10 _Z5funcAi:7 + 3.1: 10 _Z5funcBi:11 +[main:3.1 @ _Z5funcBi]:120:19 + 0: 19 + 1: 19 _Z8funcLeafi:20 + 3: 12 +[externalA:17 @ _Z5funcBi]:120:3 + 0: 3 + 1: 3 +[external:10 @ _Z5funcBi]:120:10 + 0: 10 + 1: 10 +[main:3 @ _Z5funcAi]:99:11 + 0: 10 + 1: 10 _Z8funcLeafi:11 + 2: 287864 _Z3fibi:315608 + 3: 24 +[main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608 + 0: 362839 + 1: 6 + 3: 287884 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof @@ -0,0 +1,36 @@ +_Z8funcLeafi:500853:20 + 0: 15 + 1: 15 + 3: 74946 + 10: 23324 + 15: 11 +main:154:0 + 2: 12 + 3: 18 _Z5funcAi:11 + 3.1: 18 _Z5funcBi:19 +main:154:12 + 2: 12 + 3: 10 _Z5funcAi:7 + 3.1: 10 _Z5funcBi:11 +_Z5funcBi:120:19 + 0: 19 + 1: 19 _Z8funcLeafi:20 + 3: 12 +_Z5funcBi:120:3 + 0: 3 + 1: 3 +_Z5funcBi:120:10 + 0: 10 + 1: 10 +_Z5funcAi:99:11 + 0: 10 + 1: _Z8funcLeafi:40 + 0: 6 + 1: 6 + 3: 2 + 15: 23 + 2: 315608 _Z3fibi:362839 + 0: 315608 + 1: 6 + 3: 287884 + 3: 24 diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll @@ -0,0 +1,190 @@ +;; Test 
for different function processing orders affecting inlining in sample profile loader. + +;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program. +;; With -use-profile-top-down-order=0, the top-down processing order of +;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determined based on +;; the static call graph. With -use-profile-top-down-order=1, call edges +;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi) +;; which leads to _Z8funcLeafi inlined into _Z5funcAi. +; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=INLINE +; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=NOINLINE + +;; There is an indirect call _Z5funcAi -> _Z3fibi in the program. +;; With -use-profile-indirect-call-edges=0, the processing order computed +;; based on the static call graph is (_Z3fibi, _Z5funcAi). With +;; -use-profile-indirect-call-edges=1, the indirect call edge from profile is +;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to +;; _Z3fibi inlined into _Z5funcAi. 
+; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE +; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-NOINLINE + +@factor = dso_local global i32 3, align 4, !dbg !0 +@fp = dso_local global i32 (i32)* null, align 8 + +define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 { +entry: + store i32 (i32)* @_Z3fibi, i32 (i32)** @fp, align 8, !dbg !25 + br label %for.body, !dbg !25 + +for.cond.cleanup: ; preds = %for.body + ret i32 %add3, !dbg !27 + +for.body: ; preds = %for.body, %entry + %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ] + %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32 + %add = add nuw nsw i32 %x.011, 1, !dbg !31 + %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28 + %add2 = add i32 %call, %r.010, !dbg !34 + %add3 = add i32 %add2, %call1, !dbg !35 + %dec = add nsw i32 %x.011, -1, !dbg !36 + %cmp = icmp eq i32 %x.011, 0, !dbg !38 + br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25 +} + +; INLINE: define dso_local i32 @_Z5funcAi +; INLINE-NOT: call i32 @_Z8funcLeafi +; NOINLINE: define dso_local i32 @_Z5funcAi +; NOINLINE: call i32 @_Z8funcLeafi +; ICALL-INLINE: define dso_local i32 @_Z5funcAi +; ICALL-INLINE: call i32 @_Z3foo +; ICALL-NOINLINE: define dso_local i32 @_Z5funcAi +; ICALL-NOINLINE-NOT: call i32 @_Z3foo +; ICALL-NOINLINE-NOT: call i32 @_Z3fibi +define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 { +entry: + %add = add nsw i32 %x, 100000, !dbg !44 + %0 = load i32 (i32)*, i32 (i32)** @fp, align 8 + %call = call i32 %0(i32 8), !dbg !45 + %call1 = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !46 + ret i32 %call, !dbg !46 +} + +; INLINE: define dso_local i32 @_Z8funcLeafi +; NOINLINE: define dso_local 
i32 @_Z8funcLeafi +; ICALL-INLINE: define dso_local i32 @_Z8funcLeafi +; ICALL-NOINLINE: define dso_local i32 @_Z8funcLeafi +define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 { +entry: + %cmp = icmp sgt i32 %x, 0, !dbg !57 + br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59 + +while.cond2.preheader: ; preds = %entry + %cmp313 = icmp slt i32 %x, 0, !dbg !60 + br i1 %cmp313, label %while.body4, label %if.end, !dbg !63 + +while.body: ; preds = %while.body, %entry + %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ] + %tmp = load volatile i32, i32* @factor, align 4, !dbg !64 + %call = tail call i32 @_Z5funcAi(i32 %tmp), !dbg !67 + %sub = sub nsw i32 %x.addr.016, %call, !dbg !68 + %cmp1 = icmp sgt i32 %sub, 0, !dbg !69 + br i1 %cmp1, label %while.body, label %if.end, !dbg !71 + +while.body4: ; preds = %while.body4, %while.cond2.preheader + %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ] + %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72 + %call5 = tail call i32 @_Z5funcBi(i32 %tmp1), !dbg !74 + %add = add nsw i32 %call5, %x.addr.114, !dbg !75 + %cmp3 = icmp slt i32 %add, 0, !dbg !60 + br i1 %cmp3, label %while.body4, label %if.end, !dbg !63 + +if.end: ; preds = %while.body4, %while.body, %while.cond2.preheader + %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ] + ret i32 %x.addr.2, !dbg !76 +} + +define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 { +entry: + %sub = add nsw i32 %x, -100000, !dbg !51 + %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52 + ret i32 %call, !dbg !53 +} + +define dso_local i32 @_Z3fibi(i32 %x) local_unnamed_addr #1 !dbg !77 { +entry: + %sub = add nsw i32 %x, -100000, !dbg !78 + %call = tail call i32 @_Z3foo(i32 %sub), !dbg !78 + ret i32 %sub, !dbg !78 +} + +declare i32 @_Z3foo(i32) + +attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" 
"disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } +attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!14, !15, !16} +!llvm.ident = !{!17} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo") +!4 = !{} +!5 = !{!6, !10, !11} +!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) +!7 = !DISubroutineType(types: !8) +!8 = !{!9, !9} +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 
7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) +!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) +!12 = !{!0} +!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9) +!14 = !{i32 7, !"Dwarf Version", i32 4} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!16 = !{i32 1, !"wchar_size", i32 4} +!17 = !{!"clang version 11.0.0"} +!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21) +!19 = !DISubroutineType(types: !20) +!20 = !{!9} +!21 = !{!22, !23} +!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9) +!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9) +!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3) +!25 = !DILocation(line: 13, column: 3, scope: !26) +!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2) +!27 = !DILocation(line: 17, column: 3, scope: !18) +!28 = !DILocation(line: 14, column: 10, scope: !29) +!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37) +!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3) +!31 = !DILocation(line: 14, column: 29, scope: !29) +!32 = !DILocation(line: 14, column: 21, scope: !33) +!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2) +!34 = !DILocation(line: 14, column: 19, scope: !29) +!35 = !DILocation(line: 14, column: 7, scope: !29) +!36 = !DILocation(line: 13, column: 33, scope: !37) +!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6) +!38 = !DILocation(line: 13, column: 26, scope: !39) +!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2) +!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", 
scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!44 = !DILocation(line: 26, column: 22, scope: !40) +!45 = !DILocation(line: 28, column: 11, scope: !40) +!46 = !DILocation(line: 27, column: 3, scope: !40) +!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!51 = !DILocation(line: 33, column: 22, scope: !47) +!52 = !DILocation(line: 33, column: 11, scope: !47) +!53 = !DILocation(line: 35, column: 3, scope: !47) +!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!57 = !DILocation(line: 49, column: 9, scope: !58) +!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7) +!59 = !DILocation(line: 49, column: 7, scope: !54) +!60 = !DILocation(line: 58, column: 14, scope: !61) +!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2) +!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8) +!63 = !DILocation(line: 58, column: 5, scope: !61) +!64 = !DILocation(line: 52, column: 16, scope: !65) +!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19) +!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14) +!67 = !DILocation(line: 52, column: 12, scope: !65) +!68 = !DILocation(line: 52, column: 9, scope: !65) +!69 = !DILocation(line: 51, column: 14, scope: !70) +!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2) +!71 = !DILocation(line: 51, column: 5, scope: !70) +!72 = !DILocation(line: 59, column: 16, scope: !73) +!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19) 
+!74 = !DILocation(line: 59, column: 12, scope: !73) +!75 = !DILocation(line: 59, column: 9, scope: !73) +!76 = !DILocation(line: 63, column: 3, scope: !54) +!77 = distinct !DISubprogram(name: "funcB", linkageName: "_Z3fibi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!78 = !DILocation(line: 33, column: 22, scope: !77) diff --git a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll @@ -0,0 +1,179 @@ +;; Test for different function processing orders affecting inlining in sample profile loader. + +;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program. +;; With -use-profile-top-down-order=0, the top-down processing order of +;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determined based on +;; the static call graph. With -use-profile-top-down-order=1, call edges +;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi). +;; While _Z8funcLeafi is not supposed to be inlined, the outlined entry counts +;; are affected. 
+; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=STATIC +; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=DYNAMIC + + +; STATIC: define dso_local i32 @_Z8funcLeafi{{.*}} !prof ![[#PROF:]] +; STATIC: ![[#PROF]] = !{!"function_entry_count", i64 21} +; DYNAMIC: define dso_local i32 @_Z8funcLeafi{{.*}} !prof ![[#PROF:]] +; DYNAMIC: ![[#PROF]] = !{!"function_entry_count", i64 27} + +@factor = dso_local global i32 3, align 4, !dbg !0 +@fp = dso_local global i32 (i32)* null, align 8 + +define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 { +entry: + store i32 (i32)* @_Z3fibi, i32 (i32)** @fp, align 8, !dbg !25 + br label %for.body, !dbg !25 + +for.cond.cleanup: ; preds = %for.body + ret i32 %add3, !dbg !27 + +for.body: ; preds = %for.body, %entry + %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ] + %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32 + %add = add nuw nsw i32 %x.011, 1, !dbg !31 + %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28 + %add2 = add i32 %call, %r.010, !dbg !34 + %add3 = add i32 %add2, %call1, !dbg !35 + %dec = add nsw i32 %x.011, -1, !dbg !36 + %cmp = icmp eq i32 %x.011, 0, !dbg !38 + br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25 +} + +define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 { +entry: + %add = add nsw i32 %x, 100000, !dbg !44 + %0 = load i32 (i32)*, i32 (i32)** @fp, align 8 + %call = call i32 %0(i32 8), !dbg !45 + %call1 = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !46 + ret i32 %call, !dbg !46 +} + +; INLINE: define dso_local i32 @_Z8funcLeafi +; NOINLINE: define dso_local i32 @_Z8funcLeafi +; ICALL-INLINE: define dso_local i32 @_Z8funcLeafi +; ICALL-NOINLINE: define dso_local i32 @_Z8funcLeafi +define 
dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 { +entry: + %cmp = icmp sgt i32 %x, 0, !dbg !57 + br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59 + +while.cond2.preheader: ; preds = %entry + %cmp313 = icmp slt i32 %x, 0, !dbg !60 + br i1 %cmp313, label %while.body4, label %if.end, !dbg !63 + +while.body: ; preds = %while.body, %entry + %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ] + %tmp = load volatile i32, i32* @factor, align 4, !dbg !64 + %call = tail call i32 @_Z5funcAi(i32 %tmp), !dbg !67 + %sub = sub nsw i32 %x.addr.016, %call, !dbg !68 + %cmp1 = icmp sgt i32 %sub, 0, !dbg !69 + br i1 %cmp1, label %while.body, label %if.end, !dbg !71 + +while.body4: ; preds = %while.body4, %while.cond2.preheader + %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ] + %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72 + %call5 = tail call i32 @_Z5funcBi(i32 %tmp1), !dbg !74 + %add = add nsw i32 %call5, %x.addr.114, !dbg !75 + %cmp3 = icmp slt i32 %add, 0, !dbg !60 + br i1 %cmp3, label %while.body4, label %if.end, !dbg !63 + +if.end: ; preds = %while.body4, %while.body, %while.cond2.preheader + %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ] + ret i32 %x.addr.2, !dbg !76 +} + +define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 { +entry: + %sub = add nsw i32 %x, -100000, !dbg !51 + %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52 + ret i32 %call, !dbg !53 +} + +define dso_local i32 @_Z3fibi(i32 %x) local_unnamed_addr #1 !dbg !77 { +entry: + %sub = add nsw i32 %x, -100000, !dbg !78 + %call = tail call i32 @_Z3foo(i32 %sub), !dbg !78 + ret i32 %sub, !dbg !78 +} + +declare i32 @_Z3foo(i32) + +attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } +attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!14, !15, !16} +!llvm.ident = !{!17} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo") +!4 = !{} +!5 = !{!6, !10, !11} +!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) +!7 = !DISubroutineType(types: !8) +!8 = !{!9, !9} +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) +!11 = 
!DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4) +!12 = !{!0} +!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9) +!14 = !{i32 7, !"Dwarf Version", i32 4} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!16 = !{i32 1, !"wchar_size", i32 4} +!17 = !{!"clang version 11.0.0"} +!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21) +!19 = !DISubroutineType(types: !20) +!20 = !{!9} +!21 = !{!22, !23} +!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9) +!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9) +!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3) +!25 = !DILocation(line: 13, column: 3, scope: !26) +!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2) +!27 = !DILocation(line: 17, column: 3, scope: !18) +!28 = !DILocation(line: 14, column: 10, scope: !29) +!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37) +!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3) +!31 = !DILocation(line: 14, column: 29, scope: !29) +!32 = !DILocation(line: 14, column: 21, scope: !33) +!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2) +!34 = !DILocation(line: 14, column: 19, scope: !29) +!35 = !DILocation(line: 14, column: 7, scope: !29) +!36 = !DILocation(line: 13, column: 33, scope: !37) +!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6) +!38 = !DILocation(line: 13, column: 26, scope: !39) +!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2) +!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | 
DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!44 = !DILocation(line: 26, column: 22, scope: !40) +!45 = !DILocation(line: 28, column: 11, scope: !40) +!46 = !DILocation(line: 27, column: 3, scope: !40) +!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!51 = !DILocation(line: 33, column: 22, scope: !47) +!52 = !DILocation(line: 33, column: 11, scope: !47) +!53 = !DILocation(line: 35, column: 3, scope: !47) +!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!57 = !DILocation(line: 49, column: 9, scope: !58) +!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7) +!59 = !DILocation(line: 49, column: 7, scope: !54) +!60 = !DILocation(line: 58, column: 14, scope: !61) +!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2) +!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8) +!63 = !DILocation(line: 58, column: 5, scope: !61) +!64 = !DILocation(line: 52, column: 16, scope: !65) +!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19) +!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14) +!67 = !DILocation(line: 52, column: 12, scope: !65) +!68 = !DILocation(line: 52, column: 9, scope: !65) +!69 = !DILocation(line: 51, column: 14, scope: !70) +!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2) +!71 = !DILocation(line: 51, column: 5, scope: !70) +!72 = !DILocation(line: 59, column: 16, scope: !73) +!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19) +!74 = !DILocation(line: 59, column: 12, scope: !73) +!75 = !DILocation(line: 59, 
column: 9, scope: !73) +!76 = !DILocation(line: 63, column: 3, scope: !54) +!77 = distinct !DISubprogram(name: "funcB", linkageName: "_Z3fibi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!78 = !DILocation(line: 33, column: 22, scope: !77)