diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -988,12 +988,22 @@ return {}; } + CallsitesTy &mutableCallsites() { + assert(Callsites); + return *Callsites; + } + ArrayRef allocs() const { if (Allocs) return *Allocs; return {}; } + AllocsTy &mutableAllocs() { + assert(Allocs); + return *Allocs; + } + friend struct GraphTraits; }; diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h --- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h +++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h @@ -19,9 +19,12 @@ #include "llvm/ADT/StringSet.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/PassManager.h" +#include namespace llvm { +class GlobalValueSummary; class Module; +class ModuleSummaryIndex; class MemProfContextDisambiguation : public PassInfoMixin { @@ -32,6 +35,10 @@ MemProfContextDisambiguation() {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + void run(ModuleSummaryIndex &Index, + function_ref + isPrevailing); }; } // end namespace llvm diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -75,6 +76,9 @@ cl::desc("Enable global value internalization in LTO")); } +/// Enable MemProf context disambiguation for thin link. 
+extern cl::opt EnableMemProfContextDisambiguation; + // Computes a unique hash for the Module considering the current list of // export/import and other global analysis results. // The hash is produced in \p Key. @@ -1539,6 +1543,14 @@ runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs, LocalWPDTargetsMap); + auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { + return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + }; + if (EnableMemProfContextDisambiguation) { + MemProfContextDisambiguation ContextDisambiguation; + ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing); + } + if (Conf.OptLevel > 0) ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); @@ -1580,10 +1592,6 @@ updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported, LocalWPDTargetsMap); - auto isPrevailing = [&](GlobalValue::GUID GUID, - const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); - }; thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported, isPrevailing); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -14,9 +14,9 @@ // subsequently annotated with an attribute for later transformation. // // The transformations can be performed either directly on IR (regular LTO), or -// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO -// backend). Both types of LTO operate on a the same base graph representation, -// which uses CRTP to support either IR or Index formats. +// on a ThinLTO index (and later applied to the IR during the ThinLTO backend). +// Both types of LTO operate on the same base graph representation, which +// uses CRTP to support either IR or Index formats. 
// //===----------------------------------------------------------------------===// @@ -28,9 +28,11 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/MemoryProfileInfo.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -458,6 +460,56 @@ const Module &Mod; }; +/// Represents a call in the summary index graph, which can either be an +/// allocation or an interior callsite node in an allocation's context. +/// Holds a pointer to the corresponding data structure in the index. +struct IndexCall : public PointerUnion { + IndexCall() : PointerUnion() {} + IndexCall(std::nullptr_t) : IndexCall() {} + IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} + IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} + + IndexCall *operator->() { return this; } + + void print(raw_ostream &OS) const { + if (auto *AI = dyn_cast()) + OS << *AI; + else { + auto *CI = dyn_cast(); + assert(CI); + OS << *CI; + } + } +}; + +/// CRTP derived class for graphs built from summary index (ThinLTO). +class IndexCallsiteContextGraph + : public CallsiteContextGraph { +public: + IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing); + +private: + friend CallsiteContextGraph; + + uint64_t getStackId(uint64_t IdOrIndex) const; + bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); + uint64_t getLastStackId(IndexCall &Call); + std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, + unsigned CloneNo) const; + + // Saves mapping from function summaries containing memprof records back to + // its VI, for use in checking and debugging. 
+ std::map FSToVIMap; + + const ModuleSummaryIndex &Index; +}; + namespace { struct FieldSeparator { @@ -475,6 +527,20 @@ return OS << FS.Sep; } +// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc +// type we should actually use on the corresponding allocation. +// If we can't clone a node that has NotCold+Cold alloc type, we will fall +// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold +// from NotCold. +AllocationType allocTypeToUse(uint8_t AllocTypes) { + assert(AllocTypes != (uint8_t)AllocationType::None); + if (AllocTypes == + ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) + return AllocationType::NotCold; + else + return (AllocationType)AllocTypes; +} + } // end anonymous namespace template @@ -1118,6 +1184,20 @@ return CallsiteContext.back(); } +uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + // Need to convert index into stack id. + return Index.getStackIdAtIndex(CallsiteContext.back()); +} + +static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { + if (!CloneNo) + return Base.str(); + return (Base + ".memprof." 
+ Twine(CloneNo)).str(); +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -1126,6 +1206,22 @@ .str(); } +std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, + const IndexCall &Call, + unsigned CloneNo) const { + auto VI = FSToVIMap.find(Func); + assert(VI != FSToVIMap.end()); + if (Call.is()) + return (VI->second.name() + " -> alloc").str(); + else { + auto *Callsite = Call.dyn_cast(); + return (VI->second.name() + " -> " + + getMemProfFuncName(Callsite->Callee.name(), + Callsite->Clones[CloneNo])) + .str(); + } +} + std::vector ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( Instruction *Call) { @@ -1135,6 +1231,16 @@ CallsiteContext); } +std::vector +IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + return getStackIdsWithContextNodes::const_iterator>( + CallsiteContext); +} + template template std::vector @@ -1207,6 +1313,84 @@ Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); } +IndexCallsiteContextGraph::IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) + : Index(Index) { + for (auto &I : Index) { + auto VI = Index.getValueInfo(I); + for (auto &S : VI.getSummaryList()) { + // We should only add the prevailing nodes. Otherwise we may try to clone + // in a weak copy that won't be linked (and may be different than the + // prevailing version). + // We only keep the memprof summary on the prevailing copy now when + // building the combined index, as a space optimization, however don't + // rely on this optimization. The linker doesn't resolve local linkage + // values so don't check whether those are prevailing. 
+ if (!GlobalValue::isLocalLinkage(S->linkage()) && + !isPrevailing(VI.getGUID(), S.get())) + continue; + auto *FS = dyn_cast(S.get()); + if (!FS) + continue; + std::vector CallsWithMetadata; + if (!FS->allocs().empty()) { + for (auto &AN : FS->mutableAllocs()) { + // This can happen because of recursion elimination handling that + // currently exists in ModuleSummaryAnalysis. Skip these for now. + // We still added them to the summary because we need to be able to + // correlate properly in applyImport in the backends. + if (AN.MIBs.empty()) + continue; + CallsWithMetadata.push_back({&AN}); + auto *AllocNode = addAllocNode({&AN}, FS); + // Pass an empty CallStack to the CallsiteContext (second) + // parameter, since for ThinLTO we already collapsed out the inlined + // stack ids on the allocation call during ModuleSummaryAnalysis. + CallStack::const_iterator> + EmptyContext; + // Now add all of the MIBs and their stack nodes. + for (auto &MIB : AN.MIBs) { + CallStack::const_iterator> + StackContext(&MIB); + addStackNodesForMIB::const_iterator>( + AllocNode, StackContext, EmptyContext, MIB.AllocType); + } + assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); + // Initialize version 0 on the summary alloc node to the current alloc + // type, unless it has both types in which case make it default, so + // that in the case where we aren't able to clone the original version + // always ends up with the default allocation behavior. + AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); + } + } + // For callsite metadata, add to list for this function for later use. 
+ if (!FS->callsites().empty()) + for (auto &SN : FS->mutableCallsites()) + CallsWithMetadata.push_back({&SN}); + + if (!CallsWithMetadata.empty()) + FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata}); + + if (!FS->allocs().empty() || !FS->callsites().empty()) + FSToVIMap[FS] = VI; + } + } + + if (DumpCCG) { + dbgs() << "CCG before updating call stack chains:\n"; + dbgs() << *this; + } + + if (ExportToDot) + exportToDot("prestackupdate"); + + updateStackNodes(); + + handleCallsitesWithMultipleTargets(); +} + template void CallsiteContextGraph::handleCallsitesWithMultipleTargets() { @@ -1251,6 +1435,12 @@ return IdOrIndex; } +uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { + // In the Index case this is an index into the stack id list in the summary + // index, convert it to an Id. + return Index.getStackIdAtIndex(IdOrIndex); +} + bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, const Function *Func) { auto *CB = dyn_cast(Call); @@ -1264,6 +1454,23 @@ return Alias && Alias->getAliasee() == Func; } +bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call, + const FunctionSummary *Func) { + ValueInfo Callee = Call.dyn_cast()->Callee; + // If there is no summary list then this is a call to an externally defined + // symbol. + AliasSummary *Alias = + Callee.getSummaryList().empty() + ? nullptr + : dyn_cast(Callee.getSummaryList()[0].get()); + assert(FSToVIMap.count(Func)); + return Callee == FSToVIMap[Func] || + // If callee is an alias, check the aliasee, since only function + // summary base objects will contain the stack node summaries and thus + // get a context node. 
+ (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]); +} + static std::string getAllocTypeString(uint8_t AllocTypes) { if (!AllocTypes) return "None"; @@ -1581,3 +1788,11 @@ return PreservedAnalyses::all(); return PreservedAnalyses::none(); } + +void MemProfContextDisambiguation::run( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) { + IndexCallsiteContextGraph CCG(Index, isPrevailing); + CCG.process(); +} diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -0,0 +1,211 @@ +;; Test callsite context graph generation for simple call graph with +;; two memprof contexts and no inlining. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'memprof-basic.ll' +source_filename = "memprof-basic.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #0 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !7 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !8 + store ptr %call1, ptr %y, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %2, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %2) #6 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call2 = call i32 @sleep(i32 noundef 10) + %3 = load ptr, ptr %y, align 8 + %isnull3 = icmp eq ptr %3, null + br i1 %isnull3, label %delete.end5, label %delete.notnull4 + +delete.notnull4: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %3) #6 + br label %delete.end5 + +delete.end5: ; preds = %delete.notnull4, %delete.end + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin nounwind +declare void 
@_ZdaPv(ptr noundef) #2 + +declare i32 @sleep(i32 noundef) #3 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3barv() #4 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !9, !callsite !14 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #5 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3bazv() #4 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !15 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #4 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !16 + ret ptr %call +} + +attributes #0 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin nounwind } +attributes #7 = { builtin allocsize(0) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{i64 8632435727821051414} +!8 = !{i64 -3421689549917153178} +!9 = !{!10, !12} +!10 = !{!11, !"notcold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!12 = !{!13, !"cold"} +!13 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!14 = !{i64 9086428284934609951} +!15 = !{i64 -5964873800580613432} +!16 = !{i64 2732490490862098848} + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2, 3, 0 +; DUMP: AllocType 2 StackIds: 2, 3, 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: Callee: 9832687305761716512 (_Z3barv) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: Callee: 5878270615442837395 (_Z3bazv) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: 
ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 
1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -0,0 +1,311 @@ +;; Test callsite context graph generation for call graph with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *D() { +;; return new char[10]; +;; } +;; +;; char *F() { +;; return D(); +;; } +;; +;; char *C() { +;; return D(); +;; } +;; +;; char *B() { +;; return C(); +;; } +;; +;; char *E() { +;; return C(); +;; } +;; int main(int argc, char **argv) { +;; char *x = B(); // cold +;; char *y = E(); // cold +;; char *z = F(); // default +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; delete[] z; +;; sleep(10); +;; delete[] x; +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of C into both B and E. +;; Since both allocation contexts via C are cold, the matched memprof +;; metadata has the context pruned above C's callsite. This requires +;; matching the stack node for C to callsites where it was inlined (i.e. +;; the callsites in B and E that have callsite metadata that includes C's). 
+;; It also requires duplication of that node in the graph as well as the +;; duplication of the context ids along that path through the graph, +;; so that we can represent the duplicated (via inlining) C callsite. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST + +; ModuleID = 'duplicate-context-ids.ll' +source_filename = "duplicate-context-ids.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Dv() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !12 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Fv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !13 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Cv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !14 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Bv() #0 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !15 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Ev() #0 { +entry: + %call.i = call noundef ptr 
@_Z1Dv(), !callsite !16 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + %z = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z1Bv(), !callsite !17 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z1Ev(), !callsite !18 + store ptr %call1, ptr %y, align 8 + %call2 = call noundef ptr @_Z1Fv(), !callsite !19 + store ptr %call2, ptr %z, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %z, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %z, align 8 + %isnull = icmp eq ptr %3, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %3) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call3 = call i32 @sleep(i32 noundef 10) + %4 = load ptr, ptr %x, align 8 + %isnull4 = icmp eq ptr %4, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %4) #7 + br label %delete.end6 + +delete.end6: ; preds = %delete.notnull5, %delete.end + %5 = load ptr, ptr %y, align 8 + %isnull7 = icmp eq ptr %5, null + br i1 %isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %5) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + ret i32 0 +} + +; 
Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 
2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"cold"} +!9 = !{i64 6541423618768552252, i64 -6270142974039008131} +!10 = !{!11, !"notcold"} +!11 = !{i64 6541423618768552252, i64 -4903163940066524832} +!12 = !{i64 6541423618768552252} +!13 = !{i64 -4903163940066524832} +!14 = !{i64 -6270142974039008131} +!15 = !{i64 -6270142974039008131, i64 -184525619819294889} +!16 = !{i64 -6270142974039008131, i64 1905834578520680781} +!17 = !{i64 8632435727821051414} +!18 = !{i64 -3421689549917153178} +!19 = !{i64 6307901912192269588} + + +;; After adding only the alloc node memprof metadata, we only have 2 contexts. + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 + +; DUMP: Node [[C]] +; DUMP: null Call +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[F]] +; DUMP: null Call +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +;; After updating for callsite metadata, we should have generated context ids 3 and 4, +;; along with 2 new nodes for those callsites. All have the same allocation type +;; behavior as the original C node. 
+ +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 +; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 + +; DUMP: Node [[F]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[C2]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[E]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + + +; DOTPRE: digraph "prestackupdate" { +; DOTPRE: label="prestackupdate"; +; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 
Alloc0\n_Z1Dv -\> alloc}"]; +; DOTPRE: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"]; +; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"]; +; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPRE: } + + +; DOTPOST:digraph "postbuild" { +; DOTPOST: label="postbuild"; +; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; +; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"]; +; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"]; +; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"]; +; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"]; +; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"]; +; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"]; +; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"]; +; DOTPOST:} diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll new file mode 100644 --- /dev/null 
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll @@ -0,0 +1,590 @@ +;; Test callsite context graph generation for call graph with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. This test requires more +;; complex duplication due to multiple contexts for different allocations +;; that share some of the same callsite nodes. +;; +;; Original code looks like: +;; +;; char *D(bool Call1) { +;; if (Call1) +;; return new char[10]; +;; else +;; return new char[10]; +;; } +;; +;; char *C(bool Call1) { +;; return D(Call1); +;; } +;; +;; char *B(bool Call1) { +;; if (Call1) +;; return C(true); +;; else +;; return C(false); +;; } +;; +;; char *A(bool Call1) { +;; return B(Call1); +;; } +;; +;; char *A1() { +;; return A(true); +;; } +;; +;; char *A2() { +;; return A(true); +;; } +;; +;; char *A3() { +;; return A(false); +;; } +;; +;; char *A4() { +;; return A(false); +;; } +;; +;; char *E() { +;; return B(true); +;; } +;; +;; char *F() { +;; return B(false); +;; } +;; +;; int main(int argc, char **argv) { +;; char *a1 = A1(); // cold +;; char *a2 = A2(); // cold +;; char *e = E(); // default +;; char *a3 = A3(); // default +;; char *a4 = A4(); // default +;; char *f = F(); // cold +;; memset(a1, 0, 10); +;; memset(a2, 0, 10); +;; memset(e, 0, 10); +;; memset(a3, 0, 10); +;; memset(a4, 0, 10); +;; memset(f, 0, 10); +;; delete[] a3; +;; delete[] a4; +;; delete[] e; +;; sleep(10); +;; delete[] a1; +;; delete[] a2; +;; delete[] f; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of A into its callers, +;; without any other inlining or optimizations.
Since both allocation contexts +;; via A for each allocation in D have the same allocation type (cold via +;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second +;; new in D), the contexts for those respective allocations are pruned above A. +;; The allocations via E and F are to ensure we don't prune above B. +;; +;; The matching onto the inlined A[1234]->A sequences will require duplication +;; of the context id assigned to the context from A for each allocation in D. +;; This test ensures that we do this correctly in the presence of callsites +;; shared by the different duplicated context ids (i.e. callsite in C). + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z1Db,plx \ +; RUN: -r=%t.o,_Z1Cb,plx \ +; RUN: -r=%t.o,_Z1Bb,plx \ +; RUN: -r=%t.o,_Z1Ab,plx \ +; RUN: -r=%t.o,_Z2A1v,plx \ +; RUN: -r=%t.o,_Z2A2v,plx \ +; RUN: -r=%t.o,_Z2A3v,plx \ +; RUN: -r=%t.o,_Z2A4v,plx \ +; RUN: -r=%t.o,_Z1Ev,plx \ +; RUN: -r=%t.o,_Z1Fv,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t.
\ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +; ModuleID = 'duplicate-context-ids2.ll' +source_filename = "duplicate-context-ids2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z1Db(i1 noundef zeroext %Call1) #0 { +entry: + %retval = alloca ptr, align 8 + %Call1.addr = alloca i8, align 1 + %frombool = zext i1 %Call1 to i8 + store i8 %frombool, ptr %Call1.addr, align 1 + %0 = load i8, ptr %Call1.addr, align 1 + %tobool = trunc i8 %0 to i1 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12 + store ptr %call, ptr %retval, align 8 + br label %return + +if.else: ; preds = %entry + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !13, !callsite !18 + store ptr %call1, ptr %retval, align 8 + br label %return + +return: ; preds = %if.else, %if.then + %1 = load ptr, ptr %retval, align 8 + ret ptr %1 +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z1Cb(i1 noundef zeroext %Call1) #0 { +entry: + %Call1.addr = alloca i8, align 1 + %frombool = zext i1 %Call1 to i8 + store i8 %frombool, ptr %Call1.addr, align 1 + %0 = load i8, ptr %Call1.addr, align 1 + %tobool = trunc i8 %0 to i1 + %call = call noundef ptr @_Z1Db(i1 noundef zeroext %tobool), !callsite !19 + ret ptr %call +} + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z1Bb(i1 noundef zeroext %Call1) #0 { +entry: + %retval = alloca ptr, align 8 + %Call1.addr = alloca i8, align 1 + %frombool = zext i1 %Call1 to i8 + store i8 %frombool, ptr %Call1.addr, align 1 + %0 = load i8, ptr %Call1.addr, align 1 + %tobool = trunc 
i8 %0 to i1 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call noundef ptr @_Z1Cb(i1 noundef zeroext true), !callsite !20 + store ptr %call, ptr %retval, align 8 + br label %return + +if.else: ; preds = %entry + %call1 = call noundef ptr @_Z1Cb(i1 noundef zeroext false), !callsite !21 + store ptr %call1, ptr %retval, align 8 + br label %return + +return: ; preds = %if.else, %if.then + %1 = load ptr, ptr %retval, align 8 + ret ptr %1 +} + +; Function Attrs: mustprogress uwtable +define dso_local noundef ptr @_Z1Ab(i1 noundef zeroext %Call1) #2 { +entry: + %Call1.addr = alloca i8, align 1 + %frombool = zext i1 %Call1 to i8 + store i8 %frombool, ptr %Call1.addr, align 1 + %0 = load i8, ptr %Call1.addr, align 1 + %tobool = trunc i8 %0 to i1 + %call = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool), !callsite !22 + ret ptr %call +} + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z2A1v() #0 { +entry: + %Call1.addr.i = alloca i8, align 1 + store i8 1, ptr %Call1.addr.i, align 1 + %0 = load i8, ptr %Call1.addr.i, align 1 + %tobool.i = trunc i8 %0 to i1 + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !23 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z2A2v() #0 { +entry: + %Call1.addr.i = alloca i8, align 1 + store i8 1, ptr %Call1.addr.i, align 1 + %0 = load i8, ptr %Call1.addr.i, align 1 + %tobool.i = trunc i8 %0 to i1 + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !24 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z2A3v() #0 { +entry: + %Call1.addr.i = alloca i8, align 1 + store i8 0, ptr %Call1.addr.i, align 1 + %0 = load i8, ptr %Call1.addr.i, align 1 + %tobool.i = trunc i8 %0 to i1 + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !25 + ret ptr %call.i +} + +; Function Attrs: mustprogress 
noinline uwtable +define dso_local noundef ptr @_Z2A4v() #0 { +entry: + %Call1.addr.i = alloca i8, align 1 + store i8 0, ptr %Call1.addr.i, align 1 + %0 = load i8, ptr %Call1.addr.i, align 1 + %tobool.i = trunc i8 %0 to i1 + %call.i = call noundef ptr @_Z1Bb(i1 noundef zeroext %tobool.i), !callsite !26 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z1Ev() #0 { +entry: + %call = call noundef ptr @_Z1Bb(i1 noundef zeroext true), !callsite !27 + ret ptr %call +} + +; Function Attrs: mustprogress noinline uwtable +define dso_local noundef ptr @_Z1Fv() #0 { +entry: + %call = call noundef ptr @_Z1Bb(i1 noundef zeroext false), !callsite !28 + ret ptr %call +} + +; Function Attrs: mustprogress noinline norecurse uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #3 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %a1 = alloca ptr, align 8 + %a2 = alloca ptr, align 8 + %e = alloca ptr, align 8 + %a3 = alloca ptr, align 8 + %a4 = alloca ptr, align 8 + %f = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z2A1v(), !callsite !29 + store ptr %call, ptr %a1, align 8 + %call1 = call noundef ptr @_Z2A2v(), !callsite !30 + store ptr %call1, ptr %a2, align 8 + %call2 = call noundef ptr @_Z1Ev(), !callsite !31 + store ptr %call2, ptr %e, align 8 + %call3 = call noundef ptr @_Z2A3v(), !callsite !32 + store ptr %call3, ptr %a3, align 8 + %call4 = call noundef ptr @_Z2A4v(), !callsite !33 + store ptr %call4, ptr %a4, align 8 + %call5 = call noundef ptr @_Z1Fv(), !callsite !34 + store ptr %call5, ptr %f, align 8 + %0 = load ptr, ptr %a1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %a2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 
false) + %2 = load ptr, ptr %e, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %a3, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %a4, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %f, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %a3, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %a4, align 8 + %isnull6 = icmp eq ptr %7, null + br i1 %isnull6, label %delete.end8, label %delete.notnull7 + +delete.notnull7: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #8 + br label %delete.end8 + +delete.end8: ; preds = %delete.notnull7, %delete.end + %8 = load ptr, ptr %e, align 8 + %isnull9 = icmp eq ptr %8, null + br i1 %isnull9, label %delete.end11, label %delete.notnull10 + +delete.notnull10: ; preds = %delete.end8 + call void @_ZdaPv(ptr noundef %8) #8 + br label %delete.end11 + +delete.end11: ; preds = %delete.notnull10, %delete.end8 + %call12 = call i32 @sleep(i32 noundef 10) + %9 = load ptr, ptr %a1, align 8 + %isnull13 = icmp eq ptr %9, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end11 + call void @_ZdaPv(ptr noundef %9) #8 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end11 + %10 = load ptr, ptr %a2, align 8 + %isnull16 = icmp eq ptr %10, null + br i1 %isnull16, label %delete.end18, label %delete.notnull17 + +delete.notnull17: ; preds = %delete.end15 + call void @_ZdaPv(ptr noundef %10) #8 + br label %delete.end18 + +delete.end18: ; preds = %delete.notnull17, %delete.end15 + %11 = load ptr, ptr %f, align 8 + %isnull19 = icmp eq ptr 
%11, null + br i1 %isnull19, label %delete.end21, label %delete.notnull20 + +delete.notnull20: ; preds = %delete.end18 + call void @_ZdaPv(ptr noundef %11) #8 + br label %delete.end21 + +delete.end21: ; preds = %delete.notnull20, %delete.end18 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #5 + +declare i32 @sleep(i32 noundef) #6 + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress noinline norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { 
"disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781} +!10 = !{!11, !"cold"} +!11 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978} +!12 = !{i64 4854880825882961848} +!13 = !{!14, !16} +!14 = !{!15, !"notcold"} +!15 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978} +!16 = !{!17, !"cold"} +!17 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832} +!18 = !{i64 -8775068539491628272} +!19 = !{i64 -904694911315397047} +!20 = !{i64 6532298921261778285} +!21 = !{i64 7859682663773658275} +!22 = !{i64 -6528110295079665978} +!23 = !{i64 -6528110295079665978, i64 5747919905719679568} +!24 = !{i64 -6528110295079665978, i64 -5753238080028016843} +!25 = !{i64 -6528110295079665978, i64 1794685869326395337} +!26 = !{i64 -6528110295079665978, i64 5462047985461644151} +!27 = !{i64 1905834578520680781} +!28 = !{i64 -4903163940066524832} +!29 = !{i64 7001427352816737266} +!30 = !{i64 1465248046217061457} +!31 = !{i64 6307901912192269588} +!32 = !{i64 3001228341853999135} +!33 = !{i64 8637468417231829421} +!34 = !{i64 8690657650969109624} + +^0 = module: (path: "", hash: (0, 0, 0, 0, 0)) +^1 = gv: (name: "_Z2A2v", summaries: 
(function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 8, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (11918633778629885638, 12693505993681534773)))))) ; guid = 745609562354838949 +^2 = gv: (name: "_Z1Fv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (13543580133643026784)))))) ; guid = 882063111217488233 +^3 = gv: (name: "sleep") ; guid = 3188927404885408457 +^4 = gv: (name: "_Z2A3v", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 8, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (11918633778629885638, 1794685869326395337)))))) ; guid = 3426429179322477523 +^5 = gv: (name: "_ZdaPv") ; guid = 8244930240056412646 +^6 = gv: (name: "_Z1Bb", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 15, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^12)), callsites: ((callee: ^12, clones: (0), stackIds: 
(6532298921261778285)), (callee: ^12, clones: (0), stackIds: (7859682663773658275)))))) ; guid = 9116113196563097487 +^7 = gv: (name: "_Z1Db", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 15, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^15)), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (17542049162394154569, 6532298921261778285, 1905834578520680781)), (type: cold, stackIds: (17542049162394154569, 6532298921261778285, 11918633778629885638)))), (versions: (none), memProf: ((type: notcold, stackIds: (17542049162394154569, 7859682663773658275, 11918633778629885638)), (type: cold, stackIds: (17542049162394154569, 7859682663773658275, 13543580133643026784)))))))) ; guid = 11485875876353461977 +^8 = gv: (name: "_Z1Ab", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 7, calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (11918633778629885638)))))) ; guid = 12715911323781166420 +^9 = gv: (name: "llvm.memset.p0.i64") ; guid = 12767501690323846396 +^10 = gv: (name: "_Z2A4v", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 8, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (11918633778629885638, 5462047985461644151)))))) ; guid = 14330017355801777481 +^11 = gv: (name: "_Z2A1v", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 
0, dsoLocal: 1, canAutoHide: 0), insts: 8, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (11918633778629885638, 5747919905719679568)))))) ; guid = 14604580269381458981 +^12 = gv: (name: "_Z1Cb", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 7, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (17542049162394154569)))))) ; guid = 15062806102884567440 +^13 = gv: (name: "_Z1Ev", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^6)), callsites: ((callee: ^6, clones: (0), stackIds: (1905834578520680781)))))) ; guid = 15303695751864996451 +^14 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 68, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^11), (callee: ^1), (callee: ^13), (callee: ^4), (callee: ^10), (callee: ^2), (callee: ^5), (callee: ^3)), callsites: ((callee: ^11, clones: (0), stackIds: (7001427352816737266)), (callee: ^1, clones: (0), stackIds: (1465248046217061457)), (callee: ^13, clones: (0), stackIds: 
(6307901912192269588)), (callee: ^4, clones: (0), stackIds: (3001228341853999135)), (callee: ^10, clones: (0), stackIds: (8637468417231829421)), (callee: ^2, clones: (0), stackIds: (8690657650969109624)))))) ; guid = 15822663052811949562 +^15 = gv: (name: "_Znam") ; guid = 18423971256537370017 +^16 = blockcount: 29 + + +;; After adding only the alloc node memprof metadata, we only have 4 contexts (we only +;; match the interesting parts of the pre-update graph here). + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[D1:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 + +; DUMP: Node [[C:0x[a-z0-9]+]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 + +; DUMP: Node [[D2]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 4, 3 +; DUMP: AllocType 2 StackIds: 0, 4, 5 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 + + +;; After updating for callsite metadata, we should have duplicated the context +;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A, +;; and used those on new nodes for those callers. Note that while in reality +;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4, +;; due to the pruning we have lost this information and thus end up duplicating +;; both of A's contexts to all of the new nodes (which could result in some +;; unnecessary cloning).
+ +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D1]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 + +; DUMP: Node [[C]] +; DUMP: Callee: 11485875876353461977 (_Z1Db) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B1]] +; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 5 7 9 11 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[E]] +; DUMP: Callee: 
9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[E]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[D2]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 4, 3 +; DUMP: AllocType 2 StackIds: 0, 4, 5 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 + +; DUMP: Node [[B2]] +; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 6 8 10 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 + +; DUMP: Node [[F]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[A2]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 5 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5 +; DUMP: Edge from Callee 
[[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[A3]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 8 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 7 8 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7 +; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8 +; DUMP: CallerEdges: + +; DUMP: Node [[A1]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 9 10 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9 +; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10 +; DUMP: CallerEdges: + +; DUMP: Node [[A4]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 9 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 11 12 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11 +; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12 +; DUMP: CallerEdges: + +; DUMP: Node [[A]] +; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -0,0 +1,448 @@ +;; Tests callsite context graph generation for call graph containing indirect +;; calls. 
Currently this should result in conservative behavior, such that the +;; indirect call receives a null call in its graph node, to prevent subsequent +;; cloning. +;; +;; Original code looks like: +;; +;; char *foo() { +;; return new char[10]; +;; } +;; class A { +;; public: +;; virtual char *x() { return foo(); } +;; }; +;; class B : public A { +;; public: +;; char *x() final { return foo(); } +;; }; +;; char *bar(A *a) { +;; return a->x(); +;; } +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; B b; +;; char *z = bar(&b); +;; char *w = bar(&b); +;; A a; +;; char *r = bar(&a); +;; char *s = bar(&a); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; memset(w, 0, 10); +;; memset(r, 0, 10); +;; memset(s, 0, 10); +;; delete[] x; +;; delete[] w; +;; delete[] r; +;; sleep(10); +;; delete[] y; +;; delete[] z; +;; delete[] s; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Compiled without optimization to prevent inlining and devirtualization. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'indirectcall.ll' +source_filename = "indirectcall.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.B = type { %class.A } +%class.A = type { ptr } + +$_ZN1BC2Ev = comdat any + +$_ZN1AC2Ev = comdat any + +$_ZN1A1xEv = comdat any + +$_ZN1B1xEv = comdat any + +$_ZTV1B = comdat any + +$_ZTS1B = comdat any + +$_ZTS1A = comdat any + +$_ZTI1A = comdat any + +$_ZTI1B = comdat any + +$_ZTV1A = comdat any + +@_ZTV1B = internal unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1xEv] }, comdat, align 8, !type !0, !type !1, !type !2, !type !3 +@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr +@_ZTS1B = internal constant [3 x i8] c"1B\00", comdat, align 1 +@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr +@_ZTS1A = internal constant [3 x i8] c"1A\00", comdat, align 1 +@_ZTI1A = internal constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 +@_ZTI1B = internal constant { ptr, ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 +@_ZTV1A = internal unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1xEv] }, comdat, align 8, !type !0, !type !1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr, align 8 + store ptr %a, ptr %a.addr, align 8 + %0 = load ptr, ptr %a.addr, align 8 + %vtable = load ptr, ptr %0, align 8 + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %1 = load ptr, ptr %vfn, align 8 + %call = call noundef ptr %1(ptr noundef nonnull align 8 dereferenceable(8) 
%0), !callsite !11 + ret ptr %call +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #1 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + %b = alloca %class.B, align 8 + %z = alloca ptr, align 8 + %w = alloca ptr, align 8 + %a = alloca %class.A, align 8 + %r = alloca ptr, align 8 + %s = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !12 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !13 + store ptr %call1, ptr %y, align 8 + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) #7 + %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !14 + store ptr %call2, ptr %z, align 8 + %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !15 + store ptr %call3, ptr %w, align 8 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) #7 + %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !16 + store ptr %call4, ptr %r, align 8 + %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !17 + store ptr %call5, ptr %s, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %z, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %w, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %r, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %s, align 8 + call void @llvm.memset.p0.i64(ptr align 
1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %w, align 8 + %isnull6 = icmp eq ptr %7, null + br i1 %isnull6, label %delete.end8, label %delete.notnull7 + +delete.notnull7: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #8 + br label %delete.end8 + +delete.end8: ; preds = %delete.notnull7, %delete.end + %8 = load ptr, ptr %r, align 8 + %isnull9 = icmp eq ptr %8, null + br i1 %isnull9, label %delete.end11, label %delete.notnull10 + +delete.notnull10: ; preds = %delete.end8 + call void @_ZdaPv(ptr noundef %8) #8 + br label %delete.end11 + +delete.end11: ; preds = %delete.notnull10, %delete.end8 + %call12 = call i32 @sleep(i32 noundef 10) + %9 = load ptr, ptr %y, align 8 + %isnull13 = icmp eq ptr %9, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end11 + call void @_ZdaPv(ptr noundef %9) #8 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end11 + %10 = load ptr, ptr %z, align 8 + %isnull16 = icmp eq ptr %10, null + br i1 %isnull16, label %delete.end18, label %delete.notnull17 + +delete.notnull17: ; preds = %delete.end15 + call void @_ZdaPv(ptr noundef %10) #8 + br label %delete.end18 + +delete.end18: ; preds = %delete.notnull17, %delete.end15 + %11 = load ptr, ptr %s, align 8 + %isnull19 = icmp eq ptr %11, null + br i1 %isnull19, label %delete.end21, label %delete.notnull20 + +delete.notnull20: ; preds = %delete.end18 + call void @_ZdaPv(ptr noundef %11) #8 + br label %delete.end21 + +delete.end21: ; preds = %delete.notnull20, %delete.end18 + ret i32 0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define internal void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) 
unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) #7 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define internal void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_ZN1A1xEv(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !18 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_ZN1B1xEv(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !19 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #0 { +entry: + %call = call noalias 
noundef nonnull ptr @_Znam(i64 noundef 10) #9, !memprof !20, !callsite !33 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #6 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { noinline nounwind optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { nounwind } +attributes #8 = { builtin nounwind } +attributes #9 = { builtin allocsize(0) } + 
+!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFPcvE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFPcvE.virtual"} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"PIE Level", i32 2} +!9 = !{i32 7, !"uwtable", i32 2} +!10 = !{i32 7, !"frame-pointer", i32 2} +!11 = !{i64 -4820244510750103755} +!12 = !{i64 8632435727821051414} +!13 = !{i64 -3421689549917153178} +!14 = !{i64 6792096022461663180} +!15 = !{i64 -2709642582978494015} +!16 = !{i64 748269490701775343} +!17 = !{i64 -5747251260480066785} +!18 = !{i64 8256774051149711748} +!19 = !{i64 -4831879094954754638} +!20 = !{!21, !23, !25, !27, !29, !31} +!21 = !{!22, !"notcold"} +!22 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} +!23 = !{!24, !"cold"} +!24 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} +!25 = !{!26, !"notcold"} +!26 = !{i64 2732490490862098848, i64 8632435727821051414} +!27 = !{!28, !"cold"} +!28 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} +!29 = !{!30, !"notcold"} +!30 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} +!31 = !{!32, !"cold"} +!32 = !{i64 2732490490862098848, i64 -3421689549917153178} +!33 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[FOO:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 6, 8, 4 +; DUMP: AllocType 2 StackIds: 6, 8, 5 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: AllocType 2 StackIds: 7, 8, 2 +; DUMP: AllocType 1 StackIds: 7, 8, 3 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 +; DUMP: 
CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 + +; DUMP: Node [[AX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; Bar contains an indirect call, with multiple targets. Its call should be null. +; DUMP: Node [[BAR]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 + +; DUMP: Node [[MAIN3]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN4]] +; DUMP: Callee:
4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[BX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 + +; DUMP: Node [[MAIN5]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN6]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 +; DUMP: CallerEdges: + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 
6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; +; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"]; +; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"]; +; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"]; +; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 
4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"]; +; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"]; +; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -0,0 +1,245 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and partial inlining, requiring generation of a new fused node to +;; represent the inlined sequence while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of baz into foo, and +;; bar into baz. Due to the inlining of bar we will initially have two +;; allocation nodes in the graph. 
This tests that we correctly match +;; foo (with baz inlined) onto the graph nodes first, and generate a new +;; fused node for it. We should then not match baz (with bar inlined) as that +;; is not reached by the MIB contexts (since all calls from main will look +;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'inlined.ll' +source_filename = "inlined.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress uwtable +define internal noundef ptr @_Z3barv() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress uwtable +define internal noundef ptr @_Z3bazv() #0 { +entry: + %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !13 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !14 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #3 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, 
align 8 + %y = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !15 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !16 + store ptr %call1, ptr %y, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %2, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %2) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call2 = call i32 @sleep(i32 noundef 10) + %3 = load ptr, ptr %y, align 8 + %isnull3 = icmp eq ptr %3, null + br i1 %isnull3, label %delete.end5, label %delete.notnull4 + +delete.notnull4: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %3) #8 + br label %delete.end5 + +delete.end5: ; preds = %delete.notnull4, %delete.end + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #5 + +declare i32 @sleep(i32 noundef) #6 + +attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { 
mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !{i64 9086428284934609951, i64 -5964873800580613432} +!14 = !{i64 -5964873800580613432, i64 2732490490862098848} +!15 = !{i64 
8632435727821051414} +!16 = !{i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: + +; DUMP: Node [[BAZ:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 1, 2 +; DUMP: AllocType 2 StackIds: 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; This is leftover from the MIB on the alloc inlined into baz. It is not +;; matched with any call, since there is no such node in the IR. Due to the +;; null call it will not participate in any context transformations. +; DUMP: Node [[FOO2]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 
StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 + +;; This is the node synthesized for the call to bar in foo that was created +;; by inlining baz into foo. +; DUMP: Node [[FOO]] +; DUMP: Callee: 16064618363798697104 (_Z3barv) Clones: 0 StackIds: 0, 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> 
Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan"]; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> alloc}"]; +; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"]; +; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"]; +; DOT: } diff --git a/llvm/test/ThinLTO/X86/memprof-inlined2.ll b/llvm/test/ThinLTO/X86/memprof-inlined2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-inlined2.ll @@ -0,0 +1,179 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and multiple levels of inlining, requiring generation of new +;; fused nodes to represent the inlined sequence while matching callsite +;; nodes onto the graph. In particular this tests the case where a function +;; has inlined a callee containing an inlined callee. +;; +;; Original code looks like: +;; +;; char *bar() __attribute__((noinline)) { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Both foo and baz are inlined into main, at both foo callsites. +;; We should update the graph for new fused nodes for both of those inlined +;; callsites to bar. 
+;; +;; Note that foo and baz are both dead due to the inlining, but have been left +;; in the input IR to ensure that the MIB call chain is matched to the longer +;; inline sequences from main. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +; ModuleID = 'inlined2.ll' +source_filename = "inlined2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define dso_local noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12, !heapallocsite !13 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #2 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !14 + ret ptr %call +} + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !15 + ret ptr %call.i +} + +; Function Attrs: mustprogress norecurse uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #3 { +delete.end5: + %call.i.i = call noundef ptr @_Z3barv(), !callsite !16 + %call.i.i8 = call noundef ptr @_Z3barv(), !callsite !17 + call void @llvm.memset.p0.i64(ptr noundef nonnull align
1 dereferenceable(10) %call.i.i8, i8 0, i64 10, i1 false) + call void @_ZdaPv(ptr noundef nonnull %call.i.i) #8 + %call2 = call i32 @sleep(i32 noundef 10) + call void @_ZdaPv(ptr noundef nonnull %call.i.i8) #8 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) local_unnamed_addr #5 + +declare i32 @sleep(i32 noundef) local_unnamed_addr #6 + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { 
"disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!14 = !{i64 -5964873800580613432} +!15 = !{i64 -5964873800580613432, i64 2732490490862098848} +!16 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!17 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} + +^0 = module: (path: "", hash: (0, 0, 0, 0, 0)) +^1 = gv: (name: "sleep") ; guid = 3188927404885408457 +^2 = gv: (name: "_ZdaPv") ; guid = 8244930240056412646 +^3 = gv: (name: "_Z3foov", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848)))))) ; guid = 9191153033785521275 +^4 = gv: (name: "llvm.memset.p0.i64") ; guid = 
12767501690323846396 +^5 = gv: (name: "_Z3bazv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000 +^6 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 7, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7), (callee: ^2), (callee: ^1)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))) ; guid = 15822663052811949562 +^7 = gv: (name: "_Z3barv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^8)), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 17377440600225628772 +^8 = gv: (name: "_Znam") ; guid = 18423971256537370017 +^9 = flags: 8 +^10 = blockcount: 4 + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; 
DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +;; This is the node synthesized for the first inlined call chain of main->foo->baz +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +;; This is the node synthesized for the second inlined call chain of main->foo->baz +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: