diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -988,12 +988,22 @@ return {}; } + CallsitesTy &mutableCallsites() { + assert(Callsites); + return *Callsites; + } + ArrayRef allocs() const { if (Allocs) return *Allocs; return {}; } + AllocsTy &mutableAllocs() { + assert(Allocs); + return *Allocs; + } + friend struct GraphTraits; }; diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h --- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h +++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h @@ -19,9 +19,12 @@ #include "llvm/ADT/StringSet.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/PassManager.h" +#include namespace llvm { +class GlobalValueSummary; class Module; +class ModuleSummaryIndex; class MemProfContextDisambiguation : public PassInfoMixin { @@ -32,6 +35,10 @@ MemProfContextDisambiguation() {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + void run(ModuleSummaryIndex &Index, + function_ref + isPrevailing); }; } // end namespace llvm diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -75,6 +76,9 @@ cl::desc("Enable global value internalization in LTO")); } +/// Enable MemProf context disambiguation for thin link. +extern cl::opt EnableMemProfContextDisambiguation; + // Computes a unique hash for the Module considering the current list of // export/import and other global analysis results. // The hash is produced in \p Key. @@ -1539,6 +1543,14 @@ runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs, LocalWPDTargetsMap); + auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { + return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + }; + if (EnableMemProfContextDisambiguation) { + MemProfContextDisambiguation ContextDisambiguation; + ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing); + } + if (Conf.OptLevel > 0) ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); @@ -1580,10 +1592,6 @@ updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported, LocalWPDTargetsMap); - auto isPrevailing = [&](GlobalValue::GUID GUID, - const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); - }; thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported, isPrevailing); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -14,9 +14,9 @@ // subsequently annotated with an attribute for later transformation. // // The transformations can be performed either directly on IR (regular LTO), or -// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO -// backend). Both types of LTO operate on a the same base graph representation, -// which uses CRTP to support either IR or Index formats. +// on a ThinLTO index (and later applied to the IR during the ThinLTO backend). +// Both types of LTO operate on a the same base graph representation, which +// uses CRTP to support either IR or Index formats. // //===----------------------------------------------------------------------===// @@ -28,9 +28,11 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/MemoryProfileInfo.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -458,6 +460,56 @@ const Module &Mod; }; +/// Represents a call in the summary index graph, which can either be an +/// allocation or an interior callsite node in an allocation's context. +/// Holds a pointer to the corresponding data structure in the index. +struct IndexCall : public PointerUnion { + IndexCall() : PointerUnion() {} + IndexCall(std::nullptr_t) : IndexCall() {} + IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} + IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} + + IndexCall *operator->() { return this; } + + void print(raw_ostream &OS) const { + if (auto *AI = dyn_cast()) + OS << *AI; + else { + auto *CI = dyn_cast(); + assert(CI); + OS << *CI; + } + } +}; + +/// CRTP derived class for graphs built from summary index (ThinLTO). +class IndexCallsiteContextGraph + : public CallsiteContextGraph { +public: + IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing); + +private: + friend CallsiteContextGraph; + + uint64_t getStackId(uint64_t IdOrIndex) const; + bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); + uint64_t getLastStackId(IndexCall &Call); + std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, + unsigned CloneNo) const; + + // Saves mapping from function summaries containing memprof records back to + // its VI, for use in checking and debugging. + std::map FSToVIMap; + + const ModuleSummaryIndex &Index; +}; + namespace { struct FieldSeparator { @@ -475,6 +527,20 @@ return OS << FS.Sep; } +// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc +// type we should actually use on the corresponding allocation. +// If we can't clone a node that has NotCold+Cold alloc type, we will fall +// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold +// from NotCold. +AllocationType allocTypeToUse(uint8_t AllocTypes) { + assert(AllocTypes != (uint8_t)AllocationType::None); + if (AllocTypes == + ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) + return AllocationType::NotCold; + else + return (AllocationType)AllocTypes; +} + } // end anonymous namespace template @@ -1118,6 +1184,20 @@ return CallsiteContext.back(); } +uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + // Need to convert index into stack id. + return Index.getStackIdAtIndex(CallsiteContext.back()); +} + +static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { + if (!CloneNo) + return Base.str(); + return (Base + ".memprof." + Twine(CloneNo)).str(); +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -1126,6 +1206,22 @@ .str(); } +std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, + const IndexCall &Call, + unsigned CloneNo) const { + auto VI = FSToVIMap.find(Func); + assert(VI != FSToVIMap.end()); + if (Call.is()) + return (VI->second.name() + " -> alloc").str(); + else { + auto *Callsite = Call.dyn_cast(); + return (VI->second.name() + " -> " + + getMemProfFuncName(Callsite->Callee.name(), + Callsite->Clones[CloneNo])) + .str(); + } +} + std::vector ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( Instruction *Call) { @@ -1135,6 +1231,16 @@ CallsiteContext); } +std::vector +IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + return getStackIdsWithContextNodes::const_iterator>( + CallsiteContext); +} + template template std::vector @@ -1207,6 +1313,84 @@ Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); } +IndexCallsiteContextGraph::IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) + : Index(Index) { + for (auto &I : Index) { + auto VI = Index.getValueInfo(I); + for (auto &S : VI.getSummaryList()) { + // We should only add the prevailing nodes. Otherwise we may try to clone + // in a weak copy that won't be linked (and may be different than the + // prevailing version). + // We only keep the memprof summary on the prevailing copy now when + // building the combined index, as a space optimization, however don't + // rely on this optimization. The linker doesn't resolve local linkage + // values so don't check whether those are prevailing. + if (!GlobalValue::isLocalLinkage(S->linkage()) && + !isPrevailing(VI.getGUID(), S.get())) + continue; + auto *FS = dyn_cast(S.get()); + if (!FS) + continue; + std::vector CallsWithMetadata; + if (!FS->allocs().empty()) { + for (auto &AN : FS->mutableAllocs()) { + // This can happen because of recursion elimination handling that + // currently exists in ModuleSummaryAnalysis. Skip these for now. + // We still added them to the summary because we need to be able to + // correlate properly in applyImport in the backends. + if (AN.MIBs.empty()) + continue; + CallsWithMetadata.push_back({&AN}); + auto *AllocNode = addAllocNode({&AN}, FS); + // Pass an empty CallStack to the CallsiteContext (second) + // parameter, since for ThinLTO we already collapsed out the inlined + // stack ids on the allocation call during ModuleSummaryAnalysis. + CallStack::const_iterator> + EmptyContext; + // Now add all of the MIBs and their stack nodes. + for (auto &MIB : AN.MIBs) { + CallStack::const_iterator> + StackContext(&MIB); + addStackNodesForMIB::const_iterator>( + AllocNode, StackContext, EmptyContext, MIB.AllocType); + } + assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); + // Initialize version 0 on the summary alloc node to the current alloc + // type, unless it has both types in which case make it default, so + // that in the case where we aren't able to clone the original version + // always ends up with the default allocation behavior. + AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); + } + } + // For callsite metadata, add to list for this function for later use. + if (!FS->callsites().empty()) + for (auto &SN : FS->mutableCallsites()) + CallsWithMetadata.push_back({&SN}); + + if (!CallsWithMetadata.empty()) + FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata}); + + if (!FS->allocs().empty() || !FS->callsites().empty()) + FSToVIMap[FS] = VI; + } + } + + if (DumpCCG) { + dbgs() << "CCG before updating call stack chains:\n"; + dbgs() << *this; + } + + if (ExportToDot) + exportToDot("prestackupdate"); + + updateStackNodes(); + + handleCallsitesWithMultipleTargets(); +} + template void CallsiteContextGraph::handleCallsitesWithMultipleTargets() { @@ -1251,6 +1435,12 @@ return IdOrIndex; } +uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { + // In the Index case this is an index into the stack id list in the summary + // index, convert it to an Id. + return Index.getStackIdAtIndex(IdOrIndex); +} + bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, const Function *Func) { auto *CB = dyn_cast(Call); @@ -1264,6 +1454,23 @@ return Alias && Alias->getAliasee() == Func; } +bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call, + const FunctionSummary *Func) { + ValueInfo Callee = Call.dyn_cast()->Callee; + // If there is no summary list then this is a call to an externally defined + // symbol. + AliasSummary *Alias = + Callee.getSummaryList().empty() + ? nullptr + : dyn_cast(Callee.getSummaryList()[0].get()); + assert(FSToVIMap.count(Func)); + return Callee == FSToVIMap[Func] || + // If callee is an alias, check the aliasee, since only function + // summary base objects will contain the stack node summaries and thus + // get a context node. + (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]); +} + static std::string getAllocTypeString(uint8_t AllocTypes) { if (!AllocTypes) return "None"; @@ -1581,3 +1788,11 @@ return PreservedAnalyses::all(); return PreservedAnalyses::none(); } + +void MemProfContextDisambiguation::run( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) { + IndexCallsiteContextGraph CCG(Index, isPrevailing); + CCG.process(); +} diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -0,0 +1,157 @@ +;; Test callsite context graph generation for simple call graph with +;; two memprof contexts and no inlining. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +source_filename = "memprof-basic.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +define internal ptr @_Z3barv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() { +entry: + %call = call ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() { +entry: + %call = call ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold"} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2, 3, 0 +; DUMP: AllocType 2 StackIds: 2, 3, 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: Callee: 9832687305761716512 (_Z3barv) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: Callee: 5878270615442837395 (_Z3bazv) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -0,0 +1,229 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *D() { +;; return new char[10]; +;; } +;; +;; char *F() { +;; return D(); +;; } +;; +;; char *C() { +;; return D(); +;; } +;; +;; char *B() { +;; return C(); +;; } +;; +;; char *E() { +;; return C(); +;; } +;; int main(int argc, char **argv) { +;; char *x = B(); // cold +;; char *y = E(); // cold +;; char *z = F(); // default +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; delete[] z; +;; sleep(10); +;; delete[] x; +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of C into both B and E. +;; Since both allocation contexts via C are cold, the matched memprof +;; metadata has the context pruned above C's callsite. This requires +;; matching the stack node for C to callsites where it was inlined (i.e. +;; the callsites in B and E that have callsite metadata that includes C's). +;; It also requires duplication of that node in the graph as well as the +;; duplication of the context ids along that path through the graph, +;; so that we can represent the duplicated (via inlining) C callsite. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST + + +source_filename = "duplicate-context-ids.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal ptr @_Z1Dv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z1Fv() { +entry: + %call = call ptr @_Z1Dv(), !callsite !6 + ret ptr null +} + +define internal ptr @_Z1Cv() { +entry: + %call = call ptr @_Z1Dv(), !callsite !7 + ret ptr null +} + +define internal ptr @_Z1Bv() { +entry: + %call.i = call ptr @_Z1Dv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z1Ev() { +entry: + %call.i = call ptr @_Z1Dv(), !callsite !9 + ret ptr null +} + +declare i32 @main() + +declare void @_ZdaPv() + +declare i32 @sleep() + +!0 = !{!1, !3} +!1 = !{!2, !"cold"} +!2 = !{i64 6541423618768552252, i64 -6270142974039008131} +!3 = !{!4, !"notcold"} +!4 = !{i64 6541423618768552252, i64 -4903163940066524832} +!5 = !{i64 6541423618768552252} +!6 = !{i64 -4903163940066524832} +!7 = !{i64 -6270142974039008131} +!8 = !{i64 -6270142974039008131, i64 -184525619819294889} +!9 = !{i64 -6270142974039008131, i64 1905834578520680781} + + +;; After adding only the alloc node memprof metadata, we only have 2 contexts. + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 + +; DUMP: Node [[C]] +; DUMP: null Call +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[F]] +; DUMP: null Call +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +;; After updating for callsite metadata, we should have generated context ids 3 and 4, +;; along with 2 new nodes for those callsites. All have the same allocation type +;; behavior as the original C node. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 +; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 + +; DUMP: Node [[F]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[C2]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[E]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + + +; DOTPRE: digraph "prestackupdate" { +; DOTPRE: label="prestackupdate"; +; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; +; DOTPRE: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"]; +; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"]; +; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPRE: } + + +; DOTPOST:digraph "postbuild" { +; DOTPOST: label="postbuild"; +; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; +; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; +; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; +; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"]; +; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; +; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; +; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPOST:} diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll @@ -0,0 +1,390 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. This test requires more +;; complex duplication due to multiple contexts for different allocations +;; that share some of the same callsite nodes. +;; +;; Original code looks like: +;; +;; char *D(bool Call1) { +;; if (Call1) +;; return new char[10]; +;; else +;; return new char[10]; +;; } +;; +;; char *C(bool Call1) { +;; return D(Call1); +;; } +;; +;; char *B(bool Call1) { +;; if (Call1) +;; return C(true); +;; else +;; return C(false); +;; } +;; +;; char *A(bool Call1) { +;; return B(Call1); +;; } +;; +;; char *A1() { +;; return A(true); +;; } +;; +;; char *A2() { +;; return A(true); +;; } +;; +;; char *A3() { +;; return A(false); +;; } +;; +;; char *A4() { +;; return A(false); +;; } +;; +;; char *E() { +;; return B(true); +;; } +;; +;; char *F() { +;; return B(false); +;; } +;; +;; int main(int argc, char **argv) { +;; char *a1 = A1(); // cold +;; char *a2 = A2(); // cold +;; char *e = E(); // default +;; char *a3 = A3(); // default +;; char *a4 = A4(); // default +;; char *f = F(); // cold +;; memset(a1, 0, 10); +;; memset(a2, 0, 10); +;; memset(e, 0, 10); +;; memset(a3, 0, 10); +;; memset(a4, 0, 10); +;; memset(f, 0, 10); +;; delete[] a3; +;; delete[] a4; +;; delete[] e; +;; sleep(10); +;; delete[] a1; +;; delete[] a2; +;; delete[] f; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of A into its callers, +;; without any other inlining or optimizations. Since both allocation contexts +;; via A for each allocation in D have the same allocation type (cold via +;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second +;; new in D, the contexts for those respective allocations are pruned above A. +;; The allocations via E and F are to ensure we don't prune above B. +;; +;; The matching onto the inlined A[1234]->A sequences will require duplication +;; of the context id assigned to the context from A for each allocation in D. +;; This test ensures that we do this correctly in the presence of callsites +;; shared by the different duplicated context ids (i.e. callsite in C). +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z1Db,plx \ +; RUN: -r=%t.o,_Z1Cb,plx \ +; RUN: -r=%t.o,_Z1Bb,plx \ +; RUN: -r=%t.o,_Z1Ab,plx \ +; RUN: -r=%t.o,_Z2A1v,plx \ +; RUN: -r=%t.o,_Z2A2v,plx \ +; RUN: -r=%t.o,_Z2A3v,plx \ +; RUN: -r=%t.o,_Z2A4v,plx \ +; RUN: -r=%t.o,_Z1Ev,plx \ +; RUN: -r=%t.o,_Z1Fv,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define ptr @_Z1Db(i1 %Call1) { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + br label %return + +if.else: ; No predecessors! + %call1 = call ptr @_Znam(i64 0), !memprof !6, !callsite !11 + br label %return + +return: ; preds = %if.else, %entry + ret ptr null +} + +declare ptr @_Znam(i64) + +define ptr @_Z1Cb(i1 %Call1) { +entry: + %call = call ptr @_Z1Db(i1 false), !callsite !12 + ret ptr null +} + +define ptr @_Z1Bb(i1 %Call1) { +entry: + %call = call ptr @_Z1Cb(i1 false), !callsite !13 + br label %return + +if.else: ; No predecessors! + %call1 = call ptr @_Z1Cb(i1 false), !callsite !14 + br label %return + +return: ; preds = %if.else, %entry + ret ptr null +} + +define ptr @_Z1Ab() { +entry: + %call = call ptr @_Z1Bb(i1 false), !callsite !15 + ret ptr null +} + +define ptr @_Z2A1v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !16 + ret ptr null +} + +define ptr @_Z2A2v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !17 + ret ptr null +} + +define ptr @_Z2A3v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !18 + ret ptr null +} + +define ptr @_Z2A4v() { +entry: + %call.i = call ptr @_Z1Bb(i1 false), !callsite !19 + ret ptr null +} + +define ptr @_Z1Ev() { +entry: + %call = call ptr @_Z1Bb(i1 false), !callsite !20 + ret ptr null +} + +define ptr @_Z1Fv() { +entry: + %call = call ptr @_Z1Bb(i1 false), !callsite !21 + ret ptr null +} + +declare i32 @main() + +declare void @_ZdaPv() + +declare i32 @sleep() + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0 } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781} +!3 = !{!4, !"cold"} +!4 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978} +!5 = !{i64 4854880825882961848} +!6 = !{!7, !9} +!7 = !{!8, !"notcold"} +!8 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978} +!9 = !{!10, !"cold"} +!10 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832} +!11 = !{i64 -8775068539491628272} +!12 = !{i64 -904694911315397047} +!13 = !{i64 6532298921261778285} +!14 = !{i64 7859682663773658275} +!15 = !{i64 -6528110295079665978} +!16 = !{i64 -6528110295079665978, i64 5747919905719679568} +!17 = !{i64 -6528110295079665978, i64 -5753238080028016843} +!18 = !{i64 -6528110295079665978, i64 1794685869326395337} +!19 = !{i64 -6528110295079665978, i64 5462047985461644151} +!20 = !{i64 1905834578520680781} +!21 = !{i64 -4903163940066524832} + + +;; After adding only the alloc node memprof metadata, we only have 4 contexts (we only +;; match the interesting parts of the pre-update graph here). + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[D1:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 + +; DUMP: Node [[C:0x[a-z0-9]+]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 + +; DUMP: Node [[D2]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 4, 3 +; DUMP: AllocType 2 StackIds: 0, 4, 5 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 + + +;; After updating for callsite metadata, we should have duplicated the context +;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A, +;; and used those on new nodes for those callers. Note that while in reality +;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4, +;; due to the pruning we have lost this information and thus end up duplicating +;; both of A's contexts to all of the new nodes (which could result in some +;; unnecessary cloning. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D1]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 + +; DUMP: Node [[C]] +; DUMP: Callee: 11485875876353461977 (_Z1Db) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B1]] +; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[E]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[D2]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 4, 3 +; DUMP: AllocType 2 StackIds: 0, 4, 5 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B2]] +; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 + +; DUMP: Node [[F]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[A2]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 5 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[A3]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 8 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 7 8 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: CallerEdges: + +; DUMP: Node [[A1]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 9 10 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: CallerEdges: + +; DUMP: Node [[A4]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 9 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: CallerEdges: + +; DUMP: Node [[A]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -0,0 +1,266 @@ +;; Tests callsite context graph generation for call graph containing indirect +;; calls. Currently this should result in conservative behavior, such that the +;; indirect call receives a null call in its graph node, to prevent subsequent +;; cloning. +;; +;; Original code looks like: +;; +;; char *foo() { +;; return new char[10]; +;; } +;; class A { +;; public: +;; virtual char *x() { return foo(); } +;; }; +;; class B : public A { +;; public: +;; char *x() final { return foo(); } +;; }; +;; char *bar(A *a) { +;; return a->x(); +;; } +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; B b; +;; char *z = bar(&b); +;; char *w = bar(&b); +;; A a; +;; char *r = bar(&a); +;; char *s = bar(&a); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; memset(w, 0, 10); +;; memset(r, 0, 10); +;; memset(s, 0, 10); +;; delete[] x; +;; delete[] w; +;; delete[] r; +;; sleep(10); +;; delete[] y; +;; delete[] z; +;; delete[] s; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Compiled without optimization to prevent inlining and devirtualization. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +source_filename = "indirectcall.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr +@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr + +define internal ptr @_Z3barP1A(ptr %a) { +entry: + ret ptr null +} + +define i32 @main() { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + %call2 = call ptr @_Z3barP1A(ptr null), !callsite !2 + %call3 = call ptr @_Z3barP1A(ptr null), !callsite !3 + %call4 = call ptr @_Z3barP1A(ptr null), !callsite !4 + %call5 = call ptr @_Z3barP1A(ptr null), !callsite !5 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +define internal ptr @_ZN1A1xEv() { +entry: + %call = call ptr @_Z3foov(), !callsite !6 + ret ptr null +} + +define internal ptr @_ZN1B1xEv() { +entry: + %call = call ptr @_Z3foov(), !callsite !7 + ret ptr null +} + +define internal ptr @_Z3foov() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21 + ret ptr null +} + +declare ptr @_Znam(i64) + +; uselistorder directives +uselistorder ptr @_Z3foov, { 3, 2, 1, 0 } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{i64 6792096022461663180} +!3 = !{i64 -2709642582978494015} +!4 = !{i64 748269490701775343} +!5 = !{i64 -5747251260480066785} +!6 = !{i64 8256774051149711748} +!7 = !{i64 -4831879094954754638} +!8 = !{!9, !11, !13, !15, !17, !19} +!9 = !{!10, !"notcold"} +!10 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} +!11 = !{!12, !"cold"} +!12 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} +!13 = !{!14, !"notcold"} +!14 = !{i64 2732490490862098848, i64 8632435727821051414} +!15 = !{!16, !"cold"} +!16 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} +!17 = !{!18, !"notcold"} +!18 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} +!19 = !{!20, !"cold"} +!20 = !{i64 2732490490862098848, i64 -3421689549917153178} +!21 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[FOO:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 6, 8, 4 +; DUMP: AllocType 2 StackIds: 6, 8, 5 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: AllocType 2 StackIds: 7, 8, 2 +; DUMP: AllocType 1 StackIds: 7, 8, 3 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 + +; DUMP: Node [[AX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; Bar contains an indirect call, with multiple targets. It's call should be null. +; DUMP: Node [[BAR]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 + +; DUMP: Node [[MAIN3]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN4]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[BX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 + +; DUMP: Node [[MAIN5]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN6]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; +; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"]; +; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -0,0 +1,186 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and partial inlining, requiring generation of a new fused node to +;; represent the inlined sequence while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of baz into foo, and +;; bar into baz. Due to the inlining of bar we will initially have two +;; allocation nodes in the graph. This tests that we correctly match +;; foo (with baz inlined) onto the graph nodes first, and generate a new +;; fused node for it. We should then not match baz (with bar inlined) as that +;; is not reached by the MIB contexts (since all calls from main will look +;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + + +source_filename = "inlined.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal ptr @_Z3barv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() { +entry: + %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6 + ret ptr null +} + +define internal ptr @_Z3foov() { +entry: + %call.i = call ptr @_Z3barv(), !callsite !7 + ret ptr null +} + +define i32 @main() { +entry: + %call = call ptr @_Z3foov(), !callsite !8 + %call1 = call ptr @_Z3foov(), !callsite !9 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!5 = !{i64 9086428284934609951} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432} +!7 = !{i64 -5964873800580613432, i64 2732490490862098848} +!8 = !{i64 8632435727821051414} +!9 = !{i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[BAZ:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 1, 2 +; DUMP: AllocType 2 StackIds: 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; This is leftover from the MIB on the alloc inlined into baz. It is not +;; matched with any call, since there is no such node in the IR. Due to the +;; null call it will not participate in any context transformations. +; DUMP: Node [[FOO2]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 + +;; This is the node synthesized for the call to bar in foo that was created +;; by inlining baz into foo. +; DUMP: Node [[FOO]] +; DUMP: Callee: 16064618363798697104 (_Z3barv) Clones: 0 StackIds: 0, 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> alloc}"]; +; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined2.ll b/llvm/test/ThinLTO/X86/memprof-inlined2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-inlined2.ll @@ -0,0 +1,124 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and multiple levels of inlining, requiring generation of new +;; fused nodes to represent the inlined sequence while matching callsite +;; nodes onto the graph. In particular this tests the case where a function +;; has inlined a callee containing an inlined callee. +;; +;; Original code looks like: +;; +;; char *bar() __attribute__((noinline)) { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Both foo and baz are inlined into main, at both foo callsites. +;; We should update the graph for new fused nodes for both of those inlined +;; callsites to bar. +;; +;; Note that baz and bar are both dead due to the inlining, but have been left +;; in the input IR to ensure that the MIB call chain is matched to the longer +;; inline sequences from main. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define ptr @_Z3barv() { +entry: + %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 + ret ptr null +} + +declare ptr @_Znam(i64) + +declare ptr @_Z3bazv() + +declare ptr @_Z3foov() + +define i32 @main() { +delete.end5: + %call.i.i = call ptr @_Z3barv(), !callsite !6 + %call.i.i8 = call ptr @_Z3barv(), !callsite !7 + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!5 = !{i64 9086428284934609951} +!6 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!7 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +;; This is the node synthesized for the first inlined call chain of main->foo->baz +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +;; This is the node synthesized for the second inlined call chain of main->foo->baz +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: