diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -988,12 +988,22 @@ return {}; } + CallsitesTy &mutableCallsites() { + assert(Callsites); + return *Callsites; + } + ArrayRef allocs() const { if (Allocs) return *Allocs; return {}; } + AllocsTy &mutableAllocs() { + assert(Allocs); + return *Allocs; + } + friend struct GraphTraits; }; diff --git a/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h @@ -0,0 +1,45 @@ +//====- PGHOContextDisambiguation.h - Context Disambiguation ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements support for context disambiguation of allocation calls for profile +// guided heap optimization. See implementation file for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_PGHO_CONTEXT_DISAMBIGUATION_H +#define LLVM_TRANSFORMS_IPO_PGHO_CONTEXT_DISAMBIGUATION_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/PassManager.h" +#include + +namespace llvm { +class GlobalValueSummary; +class Module; +class ModuleSummaryIndex; + +class PGHOContextDisambiguation + : public PassInfoMixin { + /// Run the context disambiguator on \p TheModule, returns true if any changes + /// was made. + bool processModule(Module &M); + +public: + PGHOContextDisambiguation() {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + void run(ModuleSummaryIndex &Index, + function_ref + isPrevailing); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_PGHO_CONTEXT_DISAMBIGUATION_H diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -51,6 +51,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/PGHOContextDisambiguation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -75,6 +76,9 @@ cl::desc("Enable global value internalization in LTO")); } +/// Enable PGHO context disambiguation for thin link. +extern cl::opt EnablePGHOContextDisambiguation; + // Computes a unique hash for the Module considering the current list of // export/import and other global analysis results. // The hash is produced in \p Key. @@ -1536,6 +1540,14 @@ runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs, LocalWPDTargetsMap); + auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { + return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + }; + if (EnablePGHOContextDisambiguation) { + PGHOContextDisambiguation ContextDisambiguation; + ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing); + } + if (Conf.OptLevel > 0) ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); @@ -1577,10 +1589,6 @@ updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported, LocalWPDTargetsMap); - auto isPrevailing = [&](GlobalValue::GUID GUID, - const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); - }; thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported, isPrevailing); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -120,6 +120,7 @@ #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/ModuleInliner.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" +#include "llvm/Transforms/IPO/PGHOContextDisambiguation.h" #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Transforms/IPO/SampleProfile.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -59,6 +59,7 @@ #include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/IPO/ModuleInliner.h" #include "llvm/Transforms/IPO/OpenMPOpt.h" +#include "llvm/Transforms/IPO/PGHOContextDisambiguation.h" #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Transforms/IPO/SampleProfile.h" @@ -275,6 +276,10 @@ clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs"))); +cl::opt EnablePGHOContextDisambiguation( + "enable-pgho-context-disambiguation", cl::init(false), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable PGHO context disambiguation")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1712,6 +1717,12 @@ InlineContext{ThinOrFullLTOPhase::FullLTOPostLink, InlinePass::CGSCCInliner})); + // Perform context disambiguation after inlining, since that would reduce the + // amount of additional cloning required to distinguish the allocation + // contexts. + if (EnablePGHOContextDisambiguation) + MPM.addPass(PGHOContextDisambiguation()); + // Optimize globals again after we ran the inliner. MPM.addPass(GlobalOptPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -87,6 +87,7 @@ MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) MODULE_PASS("partial-inliner", PartialInlinerPass()) +MODULE_PASS("pgho-context-disambiguation", PGHOContextDisambiguation()) MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion()) MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen()) MODULE_PASS("pgo-instr-use", PGOInstrumentationUse()) diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -32,6 +32,7 @@ OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp + PGHOContextDisambiguation.cpp SampleContextTracker.cpp SampleProfile.cpp SampleProfileProbe.cpp diff --git a/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp @@ -0,0 +1,1774 @@ +//===-- PGHOContextDisambiguation.cpp - Disambiguate contexts -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements support for context disambiguation of allocation +// calls for profile guided heap optimization. Specifically, it uses Memprof +// profiles which indicate context specific allocation behavior (currently +// distinguishing cold vs hot memory allocations). Cloning is performed to +// expose the cold allocation call contexts, and the allocation calls are +// subsequently annotated with an attribute for later transformation. +// +// The transformations can be performed either directly on IR (regular LTO), or +// on a ThinLTO index (and later applied to the IR during the ThinLTO backend). +// Both types of LTO operate on a the same base graph representation, which +// uses CRTP to support either IR or Index formats. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/PGHOContextDisambiguation.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/MemoryProfileInfo.h" +#include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include +#include +using namespace llvm; +using namespace llvm::memprof; + +#define DEBUG_TYPE "pgho-context-disambiguation" + + +static cl::opt DotFilePathPrefix( + "pgho-dot-file-path-prefix", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path prefix of the PGHO dot files.")); + +static cl::opt ExportToDot("pgho-export-to-dot", cl::init(false), + cl::Hidden, + cl::desc("Export graph to dot files.")); + +static cl::opt DumpCCG("pgho-dump-ccg", cl::init(false), cl::Hidden, + cl::desc("Dump CCG to stdout after each stage.")); + +static cl::opt VerifyCCG("pgho-verify-ccg", cl::init(false), cl::Hidden, + cl::desc("Perform verification checks on CCG.")); + +static cl::opt + VerifyNodes("pgho-verify-nodes", cl::init(false), cl::Hidden, + cl::desc("Perform frequent verification checks on nodes.")); + +inline bool hasSingleAllocType(uint8_t AllocTypes) { + switch (AllocTypes) { + case (uint8_t)AllocationType::Cold: + case (uint8_t)AllocationType::NotCold: + return true; + break; + case (uint8_t)AllocationType::None: + assert(false); + break; + default: + return false; + break; + } + llvm_unreachable("invalid alloc type"); +} + +/// CRTP base for graphs built from either IR or ThinLTO summary index. +/// +/// The graph represents the call contexts in all memprof metadata on allocation +/// calls, with nodes for the allocations themselves, as well as for the calls +/// in each context. The graph is initially built from the allocation memprof +/// metadata (or summary) MIBs. It is then updated to match calls with callsite +/// metadata onto the nodes, updating it to reflect any inlining performed on +/// those calls. +/// +/// Each MIB (representing an allocation's call context with allocation +/// behavior) is assigned a unique context id during the graph build. The edges +/// and nodes in the graph are decorated with the context ids they carry. This +/// is used to correctly update the graph when cloning is performed so that we +/// can uniquify the context for a single (possibly cloned) allocation. +template +class CallsiteContextGraph { +public: + CallsiteContextGraph() = default; + CallsiteContextGraph(const CallsiteContextGraph &) = default; + CallsiteContextGraph(CallsiteContextGraph &&) = default; + + /// Main entry point to perform analysis and transformations on graph. + bool process(); + + void dump() const; + void print(raw_ostream &OS) const; + + friend raw_ostream &operator<<(raw_ostream &OS, + const CallsiteContextGraph &CCG) { + CCG.print(OS); + return OS; + } + + void exportToDot(std::string Path) const; + + /// Represents a function clone via FuncTy pointer and clone number pair. + struct FuncInfo : public std::pair { + using Base = std::pair; + FuncInfo(const Base &B) : Base(B) {} + FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} + explicit operator bool() const { return this->first != nullptr; } + FuncTy *func() const { return this->first; } + unsigned cloneNo() const { return this->second; } + }; + + /// Represents a callsite clone via CallTy and clone number pair. + struct CallInfo : public std::pair { + using Base = std::pair; + CallInfo(const Base &B) : Base(B) {} + CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) + : Base(Call, CloneNo) {} + explicit operator bool() const { return (bool)this->first; } + CallTy call() const { return this->first; } + unsigned cloneNo() const { return this->second; } + void setCloneNo(unsigned N) { this->second = N; } + void print(raw_ostream &OS) const { + if (!(bool)*this) { + assert(!cloneNo()); + OS << "null Call"; + return; + } + call()->print(OS); + OS << "\t(clone " << cloneNo() << ")"; + } + void dump() const { + print(dbgs()); + dbgs() << "\n"; + } + friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { + Call.print(OS); + return OS; + } + }; + + struct ContextEdge; + + /// Node in the Callsite Context Graph + struct ContextNode { + // Keep this for now since in the IR case where we have an Instruction* it + // is not as immediately discoverable. Used for printing richer information + // when dumping graph. + bool IsAllocation; + + // Keeps track of when the Call was reset to null because there was + // recursion. + bool Recursive = false; + + // The corresponding allocation or interior call. + CallInfo Call; + + // For alloc nodes this is a unique id assigned when constructed, and for + // callsite stack nodes it is the original stack id when the node is + // constructed from the memprof MIB metadata on the alloc nodes. For any + // clones it will be 0 (it is only used when matching callsite metadata onto + // the stack nodes created when processing the allocation memprof MIBs). + uint64_t OrigStackOrAllocId = 0; + + // This will be formed by ORing together the AllocationType enum values + // for contexts including this node. + uint8_t AllocTypes = 0; + + // Edges to all callees in the profiled call stacks. + // TODO: Should this be a map (from Callee node) for more efficient lookup? + std::vector> CalleeEdges; + + // Edges to all callers in the profiled call stacks. + // TODO: Should this be a map (from Caller node) for more efficient lookup? + std::vector> CallerEdges; + + // The set of IDs for contexts including this node. + DenseSet ContextIds; + + // List of clones of this ContextNode, initially empty. + std::vector Clones; + + // If a clone, points to the original uncloned node. + ContextNode *CloneOf = nullptr; + + ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} + + ContextNode(bool IsAllocation, CallInfo C) + : IsAllocation(IsAllocation), Call(C) {} + + std::unique_ptr clone() { + auto Clone = std::make_unique(IsAllocation, Call); + if (CloneOf) { + CloneOf->Clones.push_back(Clone.get()); + Clone->CloneOf = CloneOf; + } else { + Clones.push_back(Clone.get()); + Clone->CloneOf = this; + } + return Clone; + } + + ContextNode *getOrigNode() { + if (!CloneOf) + return this; + return CloneOf; + } + + void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, + unsigned int ContextId); + + ContextEdge *findEdgeFromCallee(const ContextNode *Callee); + void eraseCalleeEdge(const ContextEdge *Edge); + void eraseCallerEdge(const ContextEdge *Edge); + + void setCall(CallInfo C) { Call = C; } + + bool hasCall() const { return (bool)Call.call(); } + + void printCall(raw_ostream &OS) const { Call.print(OS); } + + void dump() const; + void print(raw_ostream &OS) const; + + friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { + Node.print(OS); + return OS; + } + }; + + /// Edge in the Callsite Context Graph from a ContextNode N to a caller or + /// callee. + struct ContextEdge { + ContextNode *Callee; + ContextNode *Caller; + + // This will be formed by ORing together the AllocationType enum values + // for contexts including this edge. + uint8_t AllocTypes = 0; + + // The set of IDs for contexts including this edge. + DenseSet ContextIds; + + ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, + DenseSet ContextIds) + : Callee(Callee), Caller(Caller), AllocTypes(AllocType), + ContextIds(ContextIds) {} + + DenseSet &getContextIds() { return ContextIds; } + + void dump() const; + void print(raw_ostream &OS) const; + + friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { + Edge.print(OS); + return OS; + } + }; + +protected: + /// Get a list of nodes corresponding to the stack ids in the given callsite + /// context. + template + std::vector + getStackIdsWithContextNodes(CallStack &CallsiteContext); + + /// Adds nodes for the given allocation and any stack ids on its memprof MIB + /// metadata (or summary). + ContextNode *addAllocNode(CallInfo Call); + + /// Adds nodes for the given MIB stack ids. + template + void addStackNodesForMIB(ContextNode *AllocNode, + CallStack StackContext, + CallStack &CallsiteContext, + AllocationType AllocType); + + /// Matches all callsite metadata (or summary) to the nodes created for + /// allocation memprof MIB metadata, synthesizing new nodes to reflect any + /// inlining performed on those callsite instructions. + void updateStackNodes(); + + /// Update graph to conservatively handle any callsite stack nodes that target + /// multiple different callee target functions. + void handleCallsitesWithMultipleTargets(); + + /// Save lists of calls with PGHO metadata in each function, for faster + /// iteration. + std::map> FuncToCallsWithMetadata; + + /// Map from callsite node to the enclosing caller function. + std::map NodeToCallingFunc; + +private: + using EdgeIter = typename std::vector>::iterator; + + /// Duplicates the given set of context ids, returning the new duplicated + /// id set. The new ids are propagated onto the corresponding nodes + /// from FirstNode towards its leaf callees, and from LastNode towards + /// its root callers. + DenseSet + duplicateContextIds(const DenseSet &StackSequenceContextIds, + ContextNode *FirstNode, ContextNode *LastNode); + + /// Connect the NewNode to OrigNode as a new callee if TowardsCallee is true, + /// else as a new caller. + void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, + bool TowardsCallee); + + /// Get the stack id corresponding to the given Id or Index (for IR this will + /// return itself, for a summary index this will return the id recorded in the + /// index for that stack id index value. + uint64_t getStackId(uint64_t IdOrIndex) const { + return static_cast(this)->getStackId(IdOrIndex); + } + + /// Returns true if the given call targets the given function. + bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) { + return static_cast(this)->calleeMatchesFunc(Call, Func); + } + + /// Get a list of nodes corresponding to the stack ids in the given + /// callsite's context. + std::vector getStackIdsWithContextNodesForCall(CallTy Call) { + return static_cast(this)->getStackIdsWithContextNodesForCall( + Call); + } + + /// Get the last stack id in the context for callsite. + uint64_t getLastStackId(CallTy Call) { + return static_cast(this)->getLastStackId(Call); + } + + /// Gets a label to use in the dot graph for the given call clone in the given + /// function. + std::string getLabel(const FuncTy *Func, const CallTy Call, + unsigned CloneNo) const { + return static_cast(this)->getLabel(Func, Call, CloneNo); + } + + /// Helpers to find the node corresponding to the given call or stackid. + ContextNode *getNodeForInst(const CallInfo &C); + ContextNode *getNodeForAlloc(const CallInfo &C); + ContextNode *getNodeForStackId(uint64_t StackId); + + /// Removes the node information recorded for the given call. + void unsetNodeForInst(const CallInfo &C); + + /// Computes the alloc type corresponding to the given context ids, by + /// unioning their recorded alloc types. + uint8_t computeAllocType(DenseSet &ContextIds); + + /// Create a clone of Edge's callee and move Edge to that new callee node, + /// performing the necessary context id and allocation type updates. + /// If callee's caller edge iterator is supplied, it is updated when removing + /// the edge from that list. + ContextNode * + moveEdgeToNewCalleeClone(const std::shared_ptr &Edge, + EdgeIter *CallerEdgeI = nullptr); + + /// Change the callee of Edge to existing callee clone NewCallee, performing + /// the necessary context id and allocation type updates. + /// If callee's caller edge iterator is supplied, it is updated when removing + /// the edge from that list. + void moveEdgeToExistingCalleeClone(const std::shared_ptr &Edge, + ContextNode *NewCallee, + EdgeIter *CallerEdgeI = nullptr, + bool NewClone = false); + + /// Map from each context ID to the AllocationType assigned to that context. + std::map ContextIdToAllocationType; + + /// Identifies the context node created for a stack id when adding the MIB + /// contexts to the graph. This is used to locate the context nodes when + /// trying to assign the corresponding callsites with those stack ids to these + /// nodes. + std::map StackEntryIdToContextNodeMap; + + /// Maps to track the calls to their corresponding nodes in the graph. + std::map AllocationCallToContextNodeMap; + std::map NonAllocationCallToContextNodeMap; + + /// Owner of all ContextNode unique_ptrs. + std::vector> NodeOwner; + + /// Perform sanity checks on graph when requested. + void check() const; + + /// Keeps track of the last unique context id assigned. + unsigned int LastContextId = 0; +}; + +template +using ContextNode = + typename CallsiteContextGraph::ContextNode; +template +using ContextEdge = + typename CallsiteContextGraph::ContextEdge; +template +using FuncInfo = + typename CallsiteContextGraph::FuncInfo; +template +using CallInfo = + typename CallsiteContextGraph::CallInfo; + +/// CRTP derived class for graphs built from IR (regular LTO). +class ModuleCallsiteContextGraph + : public CallsiteContextGraph { +public: + ModuleCallsiteContextGraph(Module &M); + +private: + friend CallsiteContextGraph; + + uint64_t getStackId(uint64_t IdOrIndex) const; + bool calleeMatchesFunc(Instruction *Call, const Function *Func); + uint64_t getLastStackId(Instruction *Call); + std::vector getStackIdsWithContextNodesForCall(Instruction *Call); + std::string getLabel(const Function *Func, const Instruction *Call, + unsigned CloneNo) const; + + const Module &Mod; +}; + +/// Represents a call in the summary index graph, which can either be an +/// allocation or an interior callsite node in an allocation's context. +/// Holds a pointer to the corresponding data structure in the index. +struct IndexCall : public PointerUnion { + IndexCall() : PointerUnion() {} + IndexCall(std::nullptr_t) : IndexCall() {} + IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} + IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} + + IndexCall *operator->() { return this; } + + void print(raw_ostream &OS) const { + if (auto *AI = dyn_cast()) + OS << *AI; + else { + auto *CI = dyn_cast(); + assert(CI); + OS << *CI; + } + } +}; + +/// CRTP derived class for graphs built from summary index (ThinLTO). +class IndexCallsiteContextGraph + : public CallsiteContextGraph { +public: + IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing); + +private: + friend CallsiteContextGraph; + + uint64_t getStackId(uint64_t IdOrIndex) const; + bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); + uint64_t getLastStackId(IndexCall &Call); + std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, + unsigned CloneNo) const; + + // Saves mapping from function summaries containing memprof records back to + // its VI, for use in checking and debugging. + std::map FSToVIMap; + + const ModuleSummaryIndex &Index; +}; + +namespace { + +struct FieldSeparator { + bool Skip = true; + const char *Sep; + + FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} +}; + +raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { + if (FS.Skip) { + FS.Skip = false; + return OS; + } + return OS << FS.Sep; +} + +// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc +// type we should actually use on the corresponding allocation. +// If we can't clone a node that has NotCold+Cold alloc type, we will fall +// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold +// from NotCold. +AllocationType allocTypeToUse(uint8_t AllocTypes) { + assert(AllocTypes != (uint8_t)AllocationType::None); + if (AllocTypes == + ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) + return AllocationType::NotCold; + else + return (AllocationType)AllocTypes; +} + +} // end anonymous namespace + +template +ContextNode * +CallsiteContextGraph::getNodeForInst( + const CallInfo &C) { + ContextNode *Node = getNodeForAlloc(C); + if (Node) + return Node; + + auto NonAllocCallNode = NonAllocationCallToContextNodeMap.find(C); + if (NonAllocCallNode != NonAllocationCallToContextNodeMap.end()) { + return NonAllocCallNode->second; + } + return nullptr; +} + +template +ContextNode * +CallsiteContextGraph::getNodeForAlloc( + const CallInfo &C) { + auto AllocCallNode = AllocationCallToContextNodeMap.find(C); + if (AllocCallNode != AllocationCallToContextNodeMap.end()) { + return AllocCallNode->second; + } + return nullptr; +} + +template +ContextNode * +CallsiteContextGraph::getNodeForStackId( + uint64_t StackId) { + auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); + if (StackEntryNode != StackEntryIdToContextNodeMap.end()) + return StackEntryNode->second; + return nullptr; +} + +template +void CallsiteContextGraph::unsetNodeForInst( + const CallInfo &C) { + AllocationCallToContextNodeMap.erase(C) || + NonAllocationCallToContextNodeMap.erase(C); + assert(!AllocationCallToContextNodeMap.count(C) && + !NonAllocationCallToContextNodeMap.count(C)); +} + +template +void CallsiteContextGraph::ContextNode:: + addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, + unsigned int ContextId) { + for (auto &Edge : CallerEdges) { + if (Edge->Caller == Caller) { + Edge->AllocTypes |= (uint8_t)AllocType; + Edge->getContextIds().insert(ContextId); + return; + } + } + DenseSet ContextIdSet({ContextId}); + std::shared_ptr Edge = std::make_shared( + this, Caller, (uint8_t)AllocType, ContextIdSet); + CallerEdges.push_back(Edge); + Caller->CalleeEdges.push_back(Edge); +} + +template +ContextEdge * +CallsiteContextGraph::ContextNode:: + findEdgeFromCallee(const ContextNode *Callee) { + for (const auto &Edge : CalleeEdges) + if (Edge->Callee == Callee) + return Edge.get(); + return nullptr; +} + +template +void CallsiteContextGraph::ContextNode:: + eraseCalleeEdge(const ContextEdge *Edge) { + auto EI = + std::find_if(CalleeEdges.begin(), CalleeEdges.end(), + [Edge](const std::shared_ptr &CalleeEdge) { + return CalleeEdge.get() == Edge; + }); + assert(EI != CalleeEdges.end()); + CalleeEdges.erase(EI); +} + +template +void CallsiteContextGraph::ContextNode:: + eraseCallerEdge(const ContextEdge *Edge) { + auto EI = + std::find_if(CallerEdges.begin(), CallerEdges.end(), + [Edge](const std::shared_ptr &CallerEdge) { + return CallerEdge.get() == Edge; + }); + assert(EI != CallerEdges.end()); + CallerEdges.erase(EI); +} + +template +uint8_t CallsiteContextGraph::computeAllocType( + DenseSet &ContextIds) { + uint8_t BothTypes = + (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; + uint8_t AllocType = (uint8_t)AllocationType::None; + for (auto Id : ContextIds) { + AllocType |= (uint8_t)ContextIdToAllocationType[Id]; + // Bail early if alloc type reached both, no further refinement. + if (AllocType == BothTypes) + return AllocType; + } + return AllocType; +} + +template +ContextNode * +CallsiteContextGraph::addAllocNode(CallInfo Call) { + assert(!getNodeForAlloc(Call)); + NodeOwner.push_back( + std::make_unique(/*IsAllocation*/ true, Call)); + ContextNode *AllocNode = NodeOwner.back().get(); + AllocationCallToContextNodeMap[Call] = AllocNode; + // Use LastContextId as a uniq id for MIB allocation nodes. + AllocNode->OrigStackOrAllocId = LastContextId; + // Alloc type should be updated as we add in the MIBs. We should assert + // afterwards that it is not still None. + AllocNode->AllocTypes = (uint8_t)AllocationType::None; + + return AllocNode; +} + +template +template +void CallsiteContextGraph::addStackNodesForMIB( + ContextNode *AllocNode, CallStack StackContext, + CallStack &CallsiteContext, AllocationType AllocType) { + ContextIdToAllocationType[++LastContextId] = AllocType; + + // Update alloc type and context ids for this MIB. + AllocNode->AllocTypes |= (uint8_t)AllocType; + AllocNode->ContextIds.insert(LastContextId); + + // Now add or update nodes for each stack id in alloc's context. + // Later when processing the stack ids on non-alloc callsites we will adjust + // for any inlining in the context. + ContextNode *PrevNode = AllocNode; + // Look for recursion (direct recursion should have been collapsed by + // module summary analysis, here we should just be detecting mutual + // recursion). Mark these nodes so we don't try to clone. + SmallSet StackIdSet; + // Skip any on the allocation call (inlining). + for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); + ContextIter != StackContext.end(); ++ContextIter) { + auto StackId = getStackId(*ContextIter); + ContextNode *StackNode = getNodeForStackId(StackId); + if (!StackNode) { + NodeOwner.push_back( + std::make_unique(/*IsAllocation*/ false)); + StackNode = NodeOwner.back().get(); + StackEntryIdToContextNodeMap[StackId] = StackNode; + StackNode->OrigStackOrAllocId = StackId; + } + auto Ins = StackIdSet.insert(StackId); + if (!Ins.second) + StackNode->Recursive = true; + StackNode->ContextIds.insert(LastContextId); + StackNode->AllocTypes |= (uint8_t)AllocType; + PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); + PrevNode = StackNode; + } +} + +template +DenseSet +CallsiteContextGraph::duplicateContextIds( + const DenseSet &StackSequenceContextIds, ContextNode *FirstNode, + ContextNode *LastNode) { + // Replicate context ids with new ones (at the same position in the context + // id vector). Set up a map entry for the below traversal. + DenseSet NewContextIds; + DenseMap OldToNewContextIds; + for (auto OldId : StackSequenceContextIds) { + NewContextIds.insert(++LastContextId); + OldToNewContextIds[OldId] = LastContextId; + assert(ContextIdToAllocationType.count(OldId)); + ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; + } + + auto Propagate = [&](ContextNode *PrevNode, bool TowardsCallee, + const DenseSet &PrevIds, + DenseSet &Visited, + auto &&Propagate) -> void { + for (auto &Edge : + (TowardsCallee ? PrevNode->CalleeEdges : PrevNode->CallerEdges)) { + DenseSet CurIds = + set_intersection(PrevIds, Edge->getContextIds()); + // Done if no ids found along this edge. + if (CurIds.empty()) + continue; + // Map curids to new ids in edge, and in the caller/callee. Then + // recurse using cur ids. + ContextNode *NextNode = TowardsCallee ? Edge->Callee : Edge->Caller; + auto Inserted = Visited.insert(NextNode); + if (!Inserted.second) + continue; + for (auto Id : CurIds) { + auto NewId = OldToNewContextIds[Id]; + Edge->getContextIds().insert(NewId); + NextNode->ContextIds.insert(NewId); + } + Propagate(NextNode, TowardsCallee, CurIds, Visited, Propagate); + } + }; + + // Walk towards allocation nodes from FirstNode, adding the new context ids in + // the sets containing the old context ids. + DenseSet Visited; + Propagate(FirstNode, /*TowardsCallee*/ true, StackSequenceContextIds, Visited, + Propagate); + + // Walk towards allocation nodes from FirstNode, adding the new context ids in + // the sets containing the old context ids. + Visited.clear(); + Propagate(LastNode, /*TowardsCallee*/ false, StackSequenceContextIds, Visited, + Propagate); + + return NewContextIds; +} + +template +void CallsiteContextGraph::connectNewNode( + ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { + // Make a copy of the context ids, since this will be adjusted below as they + // are moved. + DenseSet RemainingContextIds = NewNode->ContextIds; + auto &OrigEdges = + TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; + // Increment iterator in loop so that we can remove edges as needed. + for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { + auto Edge = *EI; + // Remove any matching context ids from Edge, return set that were found and + // removed, these are the new edge's context ids. Also update the remaining + // (not found ids). + DenseSet NewEdgeContextIds, NotFoundContextIds; + set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, + NotFoundContextIds); + RemainingContextIds.swap(NotFoundContextIds); + // If no matching context ids for this edge, skip it. + if (NewEdgeContextIds.empty()) { + ++EI; + continue; + } + if (TowardsCallee) { + auto NewEdge = std::make_shared( + Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds), + NewEdgeContextIds); + NewNode->CalleeEdges.push_back(NewEdge); + NewEdge->Callee->CallerEdges.push_back(NewEdge); + } else { + auto NewEdge = std::make_shared( + NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds), + NewEdgeContextIds); + NewNode->CallerEdges.push_back(NewEdge); + NewEdge->Caller->CalleeEdges.push_back(NewEdge); + } + // Remove old edge if context ids empty. + if (Edge->getContextIds().empty()) { + if (TowardsCallee) { + Edge->Callee->eraseCallerEdge(Edge.get()); + EI = OrigNode->CalleeEdges.erase(EI); + } else { + Edge->Caller->eraseCalleeEdge(Edge.get()); + EI = OrigNode->CallerEdges.erase(EI); + } + continue; + } + ++EI; + } +} + +template +void CallsiteContextGraph::updateStackNodes() { + // Map of stack id to all calls with that as the last (outermost caller) + // callsite id that has a context node (some might not due to pruning + // performed during matching of the allocation profile contexts). + // The CallContextInfo contains the Call and a list of its stack ids with + // ContextNodes, as well as the function containing Call. + using CallContextInfo = + std::tuple, const FuncTy *>; + DenseMap> StackIdToMatchingCalls; + for (auto &FuncEntry : FuncToCallsWithMetadata) { + auto *Func = FuncEntry.first; + for (auto Call : FuncEntry.second) { + // Ignore allocations, already handled. + if (AllocationCallToContextNodeMap.count(Call)) + continue; + auto StackIdsWithContextNodes = + getStackIdsWithContextNodesForCall(Call.call()); + // If there were no nodes created for MIBs on allocs (maybe this was in + // the unambiguous part of the MIB stack that was pruned), ignore. + if (StackIdsWithContextNodes.empty()) + continue; + // Otherwise, record this Call along with the list of ids for the last + // (outermost caller) stack id with a node. + StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( + {Call.call(), StackIdsWithContextNodes, Func}); + } + } + + // Now perform a post-order traversal over the graph, starting with the + // allocation nodes, essentially processing nodes from callers to callees. + // For any that contains an id in the map, update the graph to contain new + // nodes representing any inlining at interior callsites. Note we move the + // associated context ids over to the new nodes. + auto ProcessNode = [&](ContextNode *Node, + DenseSet &Visited, + auto &&ProcessNode) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + // Post order traversal. Iterate over a copy since we may add nodes and + // therefore new callers during the recursive call, invalidating any + // iterator over the original edge vector. We don't need to process these + // new nodes as they were already processed on creation. + auto CallerEdges = Node->CallerEdges; + for (auto &Edge : CallerEdges) { + // Skip any that have been removed during the recursion. + if (!Edge) + continue; + ProcessNode(Edge->Caller, Visited, ProcessNode); + } + + // Ignore this node if it is for an allocation or we didn't record any + // stack id lists ending at it. + if (Node->IsAllocation || + !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) + return; + + auto Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; + // Handle the simple case first. A single call with a single stack id. + // In this case there is no need to create any new context nodes, simply + // assign the context node for stack id to this Call. + if (Calls.size() == 1) { + auto &[Call, Ids, Func] = Calls[0]; + if (Ids.size() == 1) { + // It should be this Node + assert(Node == getNodeForStackId(Ids[0])); + if (Node->Recursive) + return; + Node->setCall(Call); + NonAllocationCallToContextNodeMap[Call] = Node; + NodeToCallingFunc[Node] = Func; + return; + } + } + // In order to do the best and maximal matching of + // inlined calls to context node sequences we will sort the vectors of stack + // ids in reverse order of length, and within each length, lexicographically + // by stack id. The latter is so that we can specially handle calls that + // have identical stack id sequences (either due to cloning or artificially + // because of the MIB context pruning). + std::sort(Calls.begin(), Calls.end(), + [](const CallContextInfo &A, const CallContextInfo &B) { + auto &[CallA, IdsA, FuncA] = A; + auto &[CallB, IdsB, FuncB] = B; + return IdsA.size() > IdsB.size() || + (IdsA.size() == IdsB.size() && IdsA < IdsB); + }); + for (unsigned I = 0; I < Calls.size(); I++) { + auto &[Call, Ids, Func] = Calls[I]; + // First compute the context ids for this stack id sequence (the + // intersection of the context ids of the corresponding nodes). + DenseSet StackSequenceContextIds; + ContextNode *CurNode = nullptr; + ContextNode *PrevNode = nullptr; + ContextNode *FirstNode = nullptr; + bool Skip = false; + for (auto Id : Ids) { + CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes. + assert(CurNode); + + if (CurNode->Recursive) { + Skip = true; + break; + } + + // If this is the first node, simply initialize the context ids set. + // We will subsequently refine the context ids by computing the + // intersection along all edges (initializing with the first node's + // context ids handles the case where there is a single id/node). + if (!FirstNode) { + FirstNode = PrevNode = CurNode; + StackSequenceContextIds = CurNode->ContextIds; + continue; + } + + auto *Edge = CurNode->findEdgeFromCallee(PrevNode); + // If there is no edge then the nodes belong to different MIB contexts, + // and we should skip this inlined context sequence. + if (!Edge) { + Skip = true; + break; + } + PrevNode = CurNode; + + // Update the context ids, which is the intersection of the ids along + // all edges in the sequence. + set_intersect(StackSequenceContextIds, Edge->getContextIds()); + + // If we now have no context ids for clone, skip this call. + if (StackSequenceContextIds.empty()) { + Skip = true; + break; + } + } + if (Skip) + continue; + + ContextNode *LastNode = CurNode; + + // If some of this call's stack ids did not have corresponding nodes (due + // to pruning), don't include any context ids for contexts that extend + // beyond these nodes. Otherwise we would be matching part of unrelated / + // not fully matching stack contexts. To do this, subtract any context ids + // found in caller nodes of the last node found above. + if (Ids.back() != getLastStackId(Call)) { + assert(CurNode); + for (auto &PE : CurNode->CallerEdges) { + set_subtract(StackSequenceContextIds, PE->getContextIds()); + if (StackSequenceContextIds.empty()) + break; + } + // If we now have no context ids for clone, skip this call. + if (StackSequenceContextIds.empty()) + continue; + } + + // Check if the next set of stack ids is the same (since the Calls vector + // of tuples is sorted by the stack ids we can just look at the next one). + bool DuplicateStackIds = false; + if (I + 1 < Calls.size()) { + auto NextIds = std::get<1>(Calls[I + 1]); + DuplicateStackIds = Ids == NextIds; + } + + // Create new context node. If we don't have duplicate stack ids, then we + // can move all the context ids computed for the original node sequence + // onto the clone. If there are duplicate calls with the same stack ids + // then we synthesize new context ids that are duplicates of the + // originals. These new context ids need to be propagated to all + // nodes/edges on the same contexts, other than those being subsumed + // by the clone. + NodeOwner.push_back( + std::make_unique(/*IsAllocation*/ false, Call)); + ContextNode *NewNode = NodeOwner.back().get(); + NodeToCallingFunc[NewNode] = Func; + NonAllocationCallToContextNodeMap[Call] = NewNode; + NewNode->ContextIds = DuplicateStackIds + ? duplicateContextIds(StackSequenceContextIds, + FirstNode, LastNode) + : StackSequenceContextIds; + NewNode->AllocTypes = computeAllocType(NewNode->ContextIds); + + // Connect to callees of innermost stack frame in inlined call chain. + connectNewNode(NewNode, FirstNode, /*TowardsCallee*/ true); + + // Connect to callers of outermost stack frame in inlined call chain. + connectNewNode(NewNode, LastNode, /*TowardsCallee*/ false); + + // If we didn't create new context ids for clone, we need to remove those + // moved to the clone from the original nodes. + if (!DuplicateStackIds) { + ContextNode *PrevNode = nullptr; + for (auto Id : Ids) { + ContextNode *CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes. + assert(CurNode); + + // Remove the context ids moved to NewNode from CurNode, and the + // edge from the prior node. + set_subtract(CurNode->ContextIds, NewNode->ContextIds); + if (PrevNode) { + auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); + assert(PrevEdge); + set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); + if (PrevEdge->getContextIds().empty()) { + PrevNode->eraseCallerEdge(PrevEdge); + CurNode->eraseCalleeEdge(PrevEdge); + } + } + PrevNode = CurNode; + } + } + } + }; + + // Actual post-order traversal. + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) + ProcessNode(Entry.second, Visited, ProcessNode); +} + +uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { + CallStack CallsiteContext( + Call->getMetadata(LLVMContext::MD_callsite)); + return CallsiteContext.back(); +} + +uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + // Need to convert index into stack id. + return Index.getStackIdAtIndex(CallsiteContext.back()); +} + +static std::string getPGHOFuncName(Twine Base, unsigned CloneNo) { + if (!CloneNo) + return Base.str(); + return (Base + ".pgho." + std::to_string(CloneNo)).str(); +} + +std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, + const Instruction *Call, + unsigned CloneNo) const { + return (Call->getFunction()->getName() + " -\\> " + + cast(Call)->getCalledFunction()->getName()) + .str(); +} + +std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, + const IndexCall &Call, + unsigned CloneNo) const { + auto VI = FSToVIMap.find(Func); + assert(VI != FSToVIMap.end()); + if (Call.is()) + return (VI->second.name() + " -\\> alloc").str(); + else { + auto *Callsite = Call.dyn_cast(); + return (VI->second.name() + " -\\> " + + getPGHOFuncName(Callsite->Callee.name(), Callsite->Clones[CloneNo])) + .str(); + } +} + +std::vector +ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( + Instruction *Call) { + CallStack CallsiteContext( + Call->getMetadata(LLVMContext::MD_callsite)); + return getStackIdsWithContextNodes( + CallsiteContext); +} + +std::vector +IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { + assert(Call.is()); + CallStack::const_iterator> + CallsiteContext(Call.dyn_cast()); + return getStackIdsWithContextNodes::const_iterator>( + CallsiteContext); +} + +template +template +std::vector +CallsiteContextGraph::getStackIdsWithContextNodes( + CallStack &CallsiteContext) { + std::vector StackIds; + for (auto IdOrIndex : CallsiteContext) { + auto StackId = getStackId(IdOrIndex); + ContextNode *Node = getNodeForStackId(StackId); + if (!Node) + break; + StackIds.push_back(StackId); + } + return StackIds; +} + +ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { + for (auto &F : M) { + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { + FuncToCallsWithMetadata[&F].push_back(&I); + auto *AllocNode = addAllocNode(&I); + CallStack CallsiteContext( + I.getMetadata(LLVMContext::MD_callsite)); + // Add all of the MIBs and their stack nodes. + for (auto &MDOp : MemProfMD->operands()) { + auto *MIBMD = cast(MDOp); + MDNode *StackNode = getMIBStackNode(MIBMD); + assert(StackNode); + addStackNodesForMIB( + AllocNode, StackNode, CallsiteContext, getMIBAllocType(MIBMD)); + } + assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); + NodeToCallingFunc[AllocNode] = &F; + // Memprof and callsite metadata on memory allocations no longer + // needed. + I.setMetadata(LLVMContext::MD_memprof, nullptr); + I.setMetadata(LLVMContext::MD_callsite, nullptr); + } + // For callsite metadata, add to list for this function for later use. + else if (I.getMetadata(LLVMContext::MD_callsite)) + FuncToCallsWithMetadata[&F].push_back(&I); + } + } + } + + if (DumpCCG) { + dbgs() << "CCG before updating call stack chains:\n"; + dbgs() << *this; + } + + if (ExportToDot) + exportToDot("ccg.prestackupdate.dot"); + + updateStackNodes(); + + // Strip off remaining callsite metadata, no longer needed. + for (auto &FuncEntry : FuncToCallsWithMetadata) + for (auto Call : FuncEntry.second) + Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); + + handleCallsitesWithMultipleTargets(); +} + +IndexCallsiteContextGraph::IndexCallsiteContextGraph( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) + : Index(Index) { + for (auto &I : Index) { + auto VI = Index.getValueInfo(I); + for (auto &S : VI.getSummaryList()) { + // We should only add the prevailing nodes. Otherwise we may try to clone + // in a weak copy that won't be linked (and may be different than the + // prevailing version). + // We only keep the memprof summary on the prevailing copy now when + // building the combined index, as a space optimization, however don't + // rely on this optimization. The linker doesn't resolve local linkage + // values so don't check whether those are prevailing. + if (!GlobalValue::isLocalLinkage(S->linkage()) && + !isPrevailing(VI.getGUID(), S.get())) + continue; + auto *FS = dyn_cast(S.get()); + if (!FS) + continue; + if (!FS->allocs().empty()) { + for (auto &AN : FS->mutableAllocs()) { + // This can happen because of recursion elimination handling that + // currently exists in ModuleSummaryAnalysis. Skip these for now. + // We still added them to the summary because we need to be able to + // correlate properly in applyImport in the backends. + if (AN.MIBs.empty()) + continue; + FuncToCallsWithMetadata[FS].push_back({&AN}); + auto *AllocNode = addAllocNode({&AN}); + // Pass an empty CallStack to the CallsiteContext (second) + // parameter, since for ThinLTO we already collapsed out the inlined + // stack ids on the allocation call during ModuleSummaryAnalysis. + CallStack::const_iterator> + EmptyContext; + // Now add all of the MIBs and their stack nodes. + for (auto &MIB : AN.MIBs) + addStackNodesForMIB::const_iterator>( + AllocNode, &MIB, EmptyContext, MIB.AllocType); + assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); + // Initialize version 0 on the summary alloc node to the current alloc + // type, unless it has both types in which case make it default, so + // that in the case where we aren't able to clone the original version + // always ends up with the default allocation behavior. + AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); + NodeToCallingFunc[AllocNode] = FS; + } + } + // For callsite metadata, add to list for this function for later use. + if (!FS->callsites().empty()) + for (auto &SN : FS->mutableCallsites()) + FuncToCallsWithMetadata[FS].push_back({&SN}); + + if (!FS->allocs().empty() || !FS->callsites().empty()) + FSToVIMap[FS] = VI; + } + } + + if (DumpCCG) { + dbgs() << "CCG before updating call stack chains:\n"; + dbgs() << *this; + } + + if (ExportToDot) + exportToDot("ccg.prestackupdate.dot"); + + updateStackNodes(); + + handleCallsitesWithMultipleTargets(); +} + +template +void CallsiteContextGraph::handleCallsitesWithMultipleTargets() { + // Look for and workaround callsites that call multiple functions. + // This can happen for indirect calls, which needs better handling, and in + // more rare cases (e.g. macro expansion). + // TODO: To fix this for indirect calls we will want to perform speculative + // devirtualization using either the normal PGO info with ICP, or using the + // information in the profiled PGHO contexts. We can do this prior to + // this transformation for regular LTO, and for ThinLTO we can simulate that + // effect in the summary and perform the actual speculative devirtualization + // while cloning in the ThinLTO backend. + for (auto Entry = NonAllocationCallToContextNodeMap.begin(); + Entry != NonAllocationCallToContextNodeMap.end();) { + auto *Node = Entry->second; + assert(Node->Clones.empty()); + // Check all node callees and see if in the same function. + bool Removed = false; + auto Call = Node->Call.call(); + for (auto &Edge : Node->CalleeEdges) { + if (!Edge->Callee->hasCall()) + continue; + assert(NodeToCallingFunc.count(Edge->Callee)); + // Check if the called function matches that of the callee node. + if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee])) + continue; + // Work around by setting Node to have a null call, so it gets + // skipped during cloning. Otherwise assignFunctions will assert + // because its data structures are not designed to handle this case. + Entry = NonAllocationCallToContextNodeMap.erase(Entry); + Node->setCall(CallInfo()); + Removed = true; + break; + } + if (!Removed) + Entry++; + } +} + +uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { + // In the Module (IR) case this is already the Id. + return IdOrIndex; +} + +uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { + // In the Index case this is an index into the stack id list in the summary + // index, convert it to an Id. + return Index.getStackIdAtIndex(IdOrIndex); +} + +bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, + const Function *Func) { + auto *CB = dyn_cast(Call); + if (!CB->getCalledOperand()) + return false; + auto CalleeVal = CB->getCalledOperand()->stripPointerCasts(); + auto *CalleeFunc = dyn_cast(CalleeVal); + if (CalleeFunc == Func) + return true; + auto *Alias = dyn_cast(CalleeVal); + return Alias && Alias->getAliasee() == Func; +} + +bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call, + const FunctionSummary *Func) { + ValueInfo Callee = Call.dyn_cast()->Callee; + // If there is no summary list then this is a call to an externally defined + // symbol. + AliasSummary *Alias = + Callee.getSummaryList().empty() + ? nullptr + : dyn_cast(Callee.getSummaryList()[0].get()); + assert(FSToVIMap.count(Func)); + return Callee == FSToVIMap[Func] || + // If callee is an alias, check the aliasee, since only function + // summary base objects will contain the stack node summaries and thus + // get a context node. + (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]); +} + +static std::string getAllocTypeString(uint8_t AllocTypes) { + if (!AllocTypes) + return "None"; + std::string Str; + if (AllocTypes & (uint8_t)AllocationType::NotCold) + Str += "NotCold"; + if (AllocTypes & (uint8_t)AllocationType::Cold) + Str += "Cold"; + return Str; +} + +template +void CallsiteContextGraph::ContextNode::dump() + const { + print(dbgs()); + dbgs() << "\n"; +} + +template +void CallsiteContextGraph::ContextNode::print( + raw_ostream &OS) const { + OS << "Node " << this << "\n"; + OS << "\t"; + printCall(OS); + if (Recursive) + OS << " (recursive)"; + OS << "\n"; + OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; + OS << "\tContextIds:"; + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) + OS << " " << Id; + OS << "\n"; + OS << "\tCalleeEdges:\n"; + for (auto &Edge : CalleeEdges) + OS << "\t\t" << *Edge << "\n"; + OS << "\tCallerEdges:\n"; + for (auto &Edge : CallerEdges) + OS << "\t\t" << *Edge << "\n"; + if (!Clones.empty()) { + OS << "\tClones: "; + FieldSeparator FS; + for (auto *Clone : Clones) + OS << FS << Clone; + OS << "\n"; + } else if (CloneOf) { + OS << "\tClone of " << CloneOf << "\n"; + } +} + +template +void CallsiteContextGraph::ContextEdge::dump() + const { + print(dbgs()); + dbgs() << "\n"; +} + +template +void CallsiteContextGraph::ContextEdge::print( + raw_ostream &OS) const { + OS << "Edge from Callee " << Callee << " to Caller: " << Caller + << " AllocTypes: " << getAllocTypeString(AllocTypes); + OS << " ContextIds:"; + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) + OS << " " << Id; +} + +template +static void printNodesRecursively( + raw_ostream &OS, const ContextNode *Node, + DenseSet *> &Visited) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + Node->print(OS); + OS << "\n"; + for (auto &Edge : Node->CallerEdges) + printNodesRecursively(OS, Edge->Caller, + Visited); +} + +template +void CallsiteContextGraph::dump() const { + print(dbgs()); +} + +template +void CallsiteContextGraph::print( + raw_ostream &OS) const { + OS << "Callsite Context Graph:\n"; + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) { + printNodesRecursively(OS, Entry.second, + Visited); + for (auto *Clone : Entry.second->Clones) + printNodesRecursively(OS, Clone, Visited); + } +} + +template +static void checkEdge( + const std::shared_ptr> &Edge) { + // Confirm that alloc type is not None and that we have at least one context + // id. + assert(Edge->AllocTypes != (uint8_t)AllocationType::None); + assert(!Edge->ContextIds.empty()); +} + +template +static void checkNode(const ContextNode *Node, + bool CheckEdges = false) { + // Node's context ids should be the union of both its callee and caller edge + // context ids. + if (Node->CallerEdges.size()) { + auto EI = Node->CallerEdges.begin(); + auto &FirstEdge = *EI; + DenseSet CallerEdgeContextIds(FirstEdge->ContextIds); + for (EI++; EI != Node->CallerEdges.end(); EI++) { + const auto &Edge = *EI; + if (CheckEdges) + checkEdge(Edge); + set_union(CallerEdgeContextIds, Edge->ContextIds); + } + // Node can have more context ids than callers if some contexts terminate at + // node and some are longer. + assert(Node->ContextIds == CallerEdgeContextIds || + set_is_subset(CallerEdgeContextIds, Node->ContextIds)); + } + if (Node->CalleeEdges.size()) { + auto EI = Node->CalleeEdges.begin(); + auto FirstEdge = *EI; + DenseSet CalleeEdgeContextIds(FirstEdge->ContextIds); + for (EI++; EI != Node->CalleeEdges.end(); EI++) { + const auto &Edge = *EI; + if (CheckEdges) + checkEdge(Edge); + set_union(CalleeEdgeContextIds, Edge->ContextIds); + } + assert(Node->ContextIds == CalleeEdgeContextIds); + } +} + +template +static void checkNodesRecursively( + const ContextNode *Node, + DenseSet *> &Visited) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + checkNode(Node); + for (auto &Edge : Node->CallerEdges) { + // Separate edge and node checking so we don't check edges twice. + checkEdge(Edge); + checkNodesRecursively(Edge->Caller, Visited); + } +} + +template +void CallsiteContextGraph::check() const { + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) { + checkNodesRecursively(Entry.second, Visited); + for (auto *Clone : Entry.second->Clones) + checkNodesRecursively(Clone, Visited); + } +} + +namespace { +struct Attributes { + void add(const Twine &Name, const Twine &Value, + const Twine &Comment = Twine()); + void addComment(const Twine &Comment); + std::string getAsString() const; + + std::vector Attrs; + std::string Comments; +}; +} // namespace + +void Attributes::add(const Twine &Name, const Twine &Value, + const Twine &Comment) { + std::string A = Name.str(); + A += "=\""; + A += Value.str(); + A += "\""; + Attrs.push_back(A); + addComment(Comment); +} + +void Attributes::addComment(const Twine &Comment) { + if (!Comment.isTriviallyEmpty()) { + if (Comments.empty()) + Comments = " // "; + else + Comments += ", "; + Comments += Comment.str(); + } +} + +std::string Attributes::getAsString() const { + if (Attrs.empty()) + return ""; + + std::string Ret = "["; + for (auto &A : Attrs) + Ret += A + ","; + Ret.pop_back(); + Ret += "];"; + Ret += Comments; + return Ret; +} + +template +void CallsiteContextGraph::exportToDot( + std::string Path) const { + std::error_code EC; + raw_fd_ostream OS(DotFilePathPrefix + Path, EC, sys::fs::OpenFlags::OF_None); + if (EC) { + errs() << "failed to open " << DotFilePathPrefix + Path << ": " + << EC.message() << '\n'; + errs().flush(); + exit(1); + } + + auto NodeId = [](const ContextNode *Node) { + std::stringstream sstream; + sstream << std::hex << "N0x" << (unsigned long long)Node; + std::string result = sstream.str(); + return result; + }; + + auto GetContextIds = [](const DenseSet &ContextIds) { + std::string IdString = " ContextIds:"; + if (ContextIds.size() < 100) { + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) + IdString += " " + std::to_string(Id); + } else { + IdString += " (" + std::to_string(ContextIds.size()) + " ids)"; + } + return IdString; + }; + + auto AddColorAttribute = [](uint8_t AllocTypes, Attributes &A) { + if (AllocTypes == (uint8_t)AllocationType::NotCold) + // Color "brown1" actually looks like a lighter red. + A.add("fillcolor", "brown1", "default"); + else if (AllocTypes == (uint8_t)AllocationType::Cold) + A.add("fillcolor", "cyan", "cold"); + else if (AllocTypes == + ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) + // Lighter purple. + A.add("fillcolor", "mediumorchid1", "default|cold"); + else + A.add("fillcolor", "gray", "none"); + }; + + auto DrawEdge = [&](const std::shared_ptr &Edge) { + Attributes A; + A.add("tooltip", GetContextIds(Edge->ContextIds)); + AddColorAttribute(Edge->AllocTypes, A); + + OS << " " << NodeId(Edge->Caller) << " -> " << NodeId(Edge->Callee) + << A.getAsString() << "\n"; + }; + + auto NodeLabel = [&](const ContextNode *Node) { + std::string LabelString = + (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + + std::to_string(Node->OrigStackOrAllocId)) + .str(); + LabelString += "\\n"; + if (Node->hasCall()) { + auto Func = NodeToCallingFunc.find(Node); + assert(Func != NodeToCallingFunc.end()); + LabelString += + getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); + } else { + LabelString += "null call"; + if (Node->Recursive) + LabelString += " (recursive)"; + else + LabelString += " (external)"; + } + return LabelString; + }; + + OS << "digraph CallsiteContextGraph {\n"; + + DenseSet Visited; + std::vector Worklist; + for (auto &Entry : AllocationCallToContextNodeMap) { + Worklist.push_back(Entry.second); + for (auto *Clone : Entry.second->Clones) + Worklist.push_back(Clone); + } + while (!Worklist.empty()) { + const ContextNode *Node = Worklist.back(); + Worklist.pop_back(); + if (!Visited.insert(Node).second) + continue; + for (const auto &Edge : Node->CallerEdges) + Worklist.push_back(Edge->Caller); + + Attributes A; + A.add("shape", "record", "callsite"); + + A.add("label", NodeLabel(Node)); + std::string TooltipString = NodeId(Node); + TooltipString += GetContextIds(Node->ContextIds); + A.add("tooltip", TooltipString); + AddColorAttribute(Node->AllocTypes, A); + A.add("style", "filled"); + if (Node->CloneOf) { + A.add("color", "blue"); + A.add("style", "filled, bold, dashed"); + } else + A.add("style", "filled"); + + OS << " " << NodeId(Node) << " " << A.getAsString() << "\n"; + } + + OS << " // Edges:\n"; + + Visited.clear(); + Worklist.clear(); + for (auto &Entry : AllocationCallToContextNodeMap) { + Worklist.push_back(Entry.second); + for (auto *Clone : Entry.second->Clones) + Worklist.push_back(Clone); + } + while (!Worklist.empty()) { + const ContextNode *Node = Worklist.back(); + Worklist.pop_back(); + if (!Visited.insert(Node).second) + continue; + + for (const auto &Edge : Node->CallerEdges) { + Worklist.push_back(Edge->Caller); + + DrawEdge(Edge); + } + } + + OS << "}"; +} + +template +ContextNode * +CallsiteContextGraph::moveEdgeToNewCalleeClone( + const std::shared_ptr &Edge, EdgeIter *CallerEdgeI) { + ContextNode *Node = Edge->Callee; + ContextNode *Clone = Node->clone(); + assert(NodeToCallingFunc.count(Node)); + NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; + moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone*/ true); + return Clone; +} + +template +void CallsiteContextGraph:: + moveEdgeToExistingCalleeClone(const std::shared_ptr &Edge, + ContextNode *NewCallee, EdgeIter *CallerEdgeI, + bool NewClone) { + // NewCallee and Edge's current callee must be clones of the same original + // node (Edge's current callee may be the original node too). + assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); + auto &EdgeContextIds = Edge->getContextIds(); + ContextNode *OldCallee = Edge->Callee; + if (CallerEdgeI) + *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); + else + OldCallee->eraseCallerEdge(Edge.get()); + Edge->Callee = NewCallee; + NewCallee->CallerEdges.push_back(Edge); + // Don't need to update Edge's context ids since we are simply reconnecting + // it. + set_subtract(OldCallee->ContextIds, EdgeContextIds); + NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end()); + NewCallee->AllocTypes |= Edge->AllocTypes; + OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); + // OldCallee alloc type should be None iff its context id set is now empty. + assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == + OldCallee->ContextIds.empty()); + // Now walk the old callee node's callee edges and move Edge's context ids + // over to the corresponding edge into the clone (which is created here if + // this is a newly created clone). + for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { + // The context ids moving to the new callee are the subset of this edge's + // context ids and the context ids on the caller edge being moved. + DenseSet EdgeContextIdsToMove = + set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds); + set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); + OldCalleeEdge->AllocTypes = + computeAllocType(OldCalleeEdge->getContextIds()); + if (!NewClone) { + // Update context ids / alloc type on corresponding edge to NewCallee. + // There is a chance this may not exist if we are reusing an existing + // clone, specifically during function assignment, where we would have + // removed none type edges after creating the clone. If we can't find + // a corresponding edge there, fall through to the cloning below. + if (auto *NewCalleeEdge = + NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) { + NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), + EdgeContextIdsToMove.end()); + NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove); + continue; + } + } + auto NewEdge = std::make_shared( + OldCalleeEdge->Callee, NewCallee, + computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove); + NewCallee->CalleeEdges.push_back(NewEdge); + NewEdge->Callee->CallerEdges.push_back(NewEdge); + } + if (VerifyCCG) { + checkNode(OldCallee); + checkNode(NewCallee); + for (const auto &Edge : OldCallee->CalleeEdges) + checkNode(Edge->Callee); + for (const auto &Edge : NewCallee->CalleeEdges) + checkNode(Edge->Callee); + } +} + +template +bool CallsiteContextGraph::process() { + if (DumpCCG) { + dbgs() << "CCG before cloning:\n"; + dbgs() << *this; + } + if (ExportToDot) + exportToDot("ccg.postbuild.dot"); + + if (VerifyCCG) { + check(); + } + + return false; +} + +bool PGHOContextDisambiguation::processModule( + Module &M) { + bool Changed = false; + + ModuleCallsiteContextGraph CCG(M); + Changed = CCG.process(); + + return Changed; +} + +PreservedAnalyses PGHOContextDisambiguation::run(Module &M, + ModuleAnalysisManager &AM) { + if (!processModule(M)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} + +void PGHOContextDisambiguation::run( + ModuleSummaryIndex &Index, + function_ref + isPrevailing) { + IndexCallsiteContextGraph CCG(Index, isPrevailing); + CCG.process(); +} diff --git a/llvm/test/ThinLTO/X86/pgho-basic.ll b/llvm/test/ThinLTO/X86/pgho-basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-basic.ll @@ -0,0 +1,211 @@ +;; Test callsite context graph generation for simple call graph with +;; two memprof contexts and no inlining. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'pgho-basic.ll' +source_filename = "pgho-basic.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #0 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !7 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !8 + store ptr %call1, ptr %y, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %2, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %2) #6 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call2 = call i32 @sleep(i32 noundef 10) + %3 = load ptr, ptr %y, align 8 + %isnull3 = icmp eq ptr %3, null + br i1 %isnull3, label %delete.end5, label %delete.notnull4 + +delete.notnull4: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %3) #6 + br label %delete.end5 + +delete.end5: ; preds = %delete.notnull4, %delete.end + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #2 + +declare i32 @sleep(i32 noundef) #3 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3barv() #4 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !9, !callsite !14 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #5 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3bazv() #4 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !15 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #4 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !16 + ret ptr %call +} + +attributes #0 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin nounwind } +attributes #7 = { builtin allocsize(0) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{i64 8632435727821051414} +!8 = !{i64 -3421689549917153178} +!9 = !{!10, !12} +!10 = !{!11, !"notcold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!12 = !{!13, !"cold"} +!13 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!14 = !{i64 9086428284934609951} +!15 = !{i64 -5964873800580613432} +!16 = !{i64 2732490490862098848} + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2, 3, 0 +; DUMP: AllocType 2 StackIds: 2, 3, 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: Callee: 10756268697391741933 (_Z3barv) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: Callee: 17547784407117670007 (_Z3bazv) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 6988045695824228603 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 6988045695824228603 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + + +; DOT: digraph CallsiteContextGraph { +; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3barv -\> alloc",tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv",tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv",tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN2:0x[a-z0-9]+]] [shape="record",label="OrigId: 8632435727821051414\nmain -\> _Z3foov",tooltip="N[[MAIN2]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: // Edges: +; DOT: N[[BAZ]] -> N[[BAR]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[FOO]] -> N[[BAZ]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN2]] -> N[[FOO]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default +; DOT: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 2",fillcolor="cyan"]; // cold +; DOT: } diff --git a/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll @@ -0,0 +1,311 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *D() { +;; return new char[10]; +;; } +;; +;; char *F() { +;; return D(); +;; } +;; +;; char *C() { +;; return D(); +;; } +;; +;; char *B() { +;; return C(); +;; } +;; +;; char *E() { +;; return C(); +;; } +;; int main(int argc, char **argv) { +;; char *x = B(); // cold +;; char *y = E(); // cold +;; char *z = F(); // default +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; delete[] z; +;; sleep(10); +;; delete[] x; +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of C into both B and E. +;; Since both allocation contexts via C are cold, the matched memprof +;; metadata has the context pruned above C's callsite. This requires +;; matching the stack node for C to callsites where it was inlined (i.e. +;; the callsites in B and E that have callsite metadata that includes C's). +;; It also requires duplication of that node in the graph as well as the +;; duplication of the context ids along that path through the graph, +;; so that we can represent the duplicated (via inlining) C callsite. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST + +; ModuleID = 'duplicate-context-ids.ll' +source_filename = "duplicate-context-ids.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Dv() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !12 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Fv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !13 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Cv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !14 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Bv() #0 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !15 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Ev() #0 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !16 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + %z = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z1Bv(), !callsite !17 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z1Ev(), !callsite !18 + store ptr %call1, ptr %y, align 8 + %call2 = call noundef ptr @_Z1Fv(), !callsite !19 + store ptr %call2, ptr %z, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %z, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %z, align 8 + %isnull = icmp eq ptr %3, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %3) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call3 = call i32 @sleep(i32 noundef 10) + %4 = load ptr, ptr %x, align 8 + %isnull4 = icmp eq ptr %4, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %4) #7 + br label %delete.end6 + +delete.end6: ; preds = %delete.notnull5, %delete.end + %5 = load ptr, ptr %y, align 8 + %isnull7 = icmp eq ptr %5, null + br i1 %isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %5) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"cold"} +!9 = !{i64 6541423618768552252, i64 -6270142974039008131} +!10 = !{!11, !"notcold"} +!11 = !{i64 6541423618768552252, i64 -4903163940066524832} +!12 = !{i64 6541423618768552252} +!13 = !{i64 -4903163940066524832} +!14 = !{i64 -6270142974039008131} +!15 = !{i64 -6270142974039008131, i64 -184525619819294889} +!16 = !{i64 -6270142974039008131, i64 1905834578520680781} +!17 = !{i64 8632435727821051414} +!18 = !{i64 -3421689549917153178} +!19 = !{i64 6307901912192269588} + + +;; After adding only the alloc node memprof metadata, we only have 2 contexts. + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 + +; DUMP: Node [[C]] +; DUMP: null Call +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[F]] +; DUMP: null Call +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +;; After updating for callsite metadata, we should have generated context ids 3 and 4, +;; along with 2 new nodes for those callsites. All have the same allocation type +;; behavior as the original C node. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 +; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 + +; DUMP: Node [[F]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[C2]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[E]] +; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + + +; DOTPRE: digraph CallsiteContextGraph { +; DOTPRE: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> alloc",tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOTPRE: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\nnull call (external)",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOTPRE: N[[C:0x[a-z0-9]+]] [shape="record",label="OrigId: 12176601099670543485\nnull call (external)",tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPRE: // Edges: +; DOTPRE: N[[C]] -> N[[D]][tooltip=" ContextIds: 1",fillcolor="cyan"]; // cold +; DOTPRE: N[[F]] -> N[[D]][tooltip=" ContextIds: 2",fillcolor="brown1"]; // default +; DOTPRE: } + + +; DOTPOST: digraph CallsiteContextGraph { +; DOTPOST: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> alloc",tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOTPOST: N[[E:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z1Ev -\> _Z1Dv",tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPOST: N[[B:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z1Bv -\> _Z1Dv",tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPOST: N[[C:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z1Cv -\> _Z1Dv",tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPOST: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOTPOST: // Edges: +; DOTPOST: N[[F]] -> N[[D]][tooltip=" ContextIds: 2",fillcolor="brown1"]; // default +; DOTPOST: N[[C]] -> N[[D]][tooltip=" ContextIds: 3",fillcolor="cyan"]; // cold +; DOTPOST: N[[B]] -> N[[D]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold +; DOTPOST: N[[E]] -> N[[D]][tooltip=" ContextIds: 1",fillcolor="cyan"]; // cold +; DOTPOST: } diff --git a/llvm/test/ThinLTO/X86/pgho-indirectcall.ll b/llvm/test/ThinLTO/X86/pgho-indirectcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-indirectcall.ll @@ -0,0 +1,449 @@ +;; Tests callsite context graph generation for call graph containing indirect +;; calls. Currently this should result in conservative behavior, such that the +;; indirect call receives a null call in its graph node, to prevent subsequent +;; cloning. +;; +;; Original code looks like: +;; +;; char *foo() { +;; return new char[10]; +;; } +;; class A { +;; public: +;; virtual char *x() { return foo(); } +;; }; +;; class B : public A { +;; public: +;; char *x() final { return foo(); } +;; }; +;; char *bar(A *a) { +;; return a->x(); +;; } +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; B b; +;; char *z = bar(&b); +;; char *w = bar(&b); +;; A a; +;; char *r = bar(&a); +;; char *s = bar(&a); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; memset(w, 0, 10); +;; memset(r, 0, 10); +;; memset(s, 0, 10); +;; delete[] x; +;; delete[] w; +;; delete[] r; +;; sleep(10); +;; delete[] y; +;; delete[] z; +;; delete[] s; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Compiled without optimization to prevent inlining and devirtualization. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'indirectcall.ll' +source_filename = "indirectcall.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.B = type { %class.A } +%class.A = type { ptr } + +$_ZN1BC2Ev = comdat any + +$_ZN1AC2Ev = comdat any + +$_ZN1A1xEv = comdat any + +$_ZN1B1xEv = comdat any + +$_ZTV1B = comdat any + +$_ZTS1B = comdat any + +$_ZTS1A = comdat any + +$_ZTI1A = comdat any + +$_ZTI1B = comdat any + +$_ZTV1A = comdat any + +@_ZTV1B = internal unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1xEv] }, comdat, align 8, !type !0, !type !1, !type !2, !type !3 +@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr +@_ZTS1B = internal constant [3 x i8] c"1B\00", comdat, align 1 +@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr +@_ZTS1A = internal constant [3 x i8] c"1A\00", comdat, align 1 +@_ZTI1A = internal constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 +@_ZTI1B = internal constant { ptr, ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 +@_ZTV1A = internal unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1xEv] }, comdat, align 8, !type !0, !type !1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr, align 8 + store ptr %a, ptr %a.addr, align 8 + %0 = load ptr, ptr %a.addr, align 8 + %vtable = load ptr, ptr %0, align 8 + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %1 = load ptr, ptr %vfn, align 8 + %call = call noundef ptr %1(ptr noundef nonnull align 8 dereferenceable(8) %0), !callsite !11 + ret ptr %call +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #1 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + %b = alloca %class.B, align 8 + %z = alloca ptr, align 8 + %w = alloca ptr, align 8 + %a = alloca %class.A, align 8 + %r = alloca ptr, align 8 + %s = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !12 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !13 + store ptr %call1, ptr %y, align 8 + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) #7 + %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !14 + store ptr %call2, ptr %z, align 8 + %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !15 + store ptr %call3, ptr %w, align 8 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) #7 + %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !16 + store ptr %call4, ptr %r, align 8 + %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !17 + store ptr %call5, ptr %s, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %z, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %w, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %r, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %s, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %w, align 8 + %isnull6 = icmp eq ptr %7, null + br i1 %isnull6, label %delete.end8, label %delete.notnull7 + +delete.notnull7: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #8 + br label %delete.end8 + +delete.end8: ; preds = %delete.notnull7, %delete.end + %8 = load ptr, ptr %r, align 8 + %isnull9 = icmp eq ptr %8, null + br i1 %isnull9, label %delete.end11, label %delete.notnull10 + +delete.notnull10: ; preds = %delete.end8 + call void @_ZdaPv(ptr noundef %8) #8 + br label %delete.end11 + +delete.end11: ; preds = %delete.notnull10, %delete.end8 + %call12 = call i32 @sleep(i32 noundef 10) + %9 = load ptr, ptr %y, align 8 + %isnull13 = icmp eq ptr %9, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end11 + call void @_ZdaPv(ptr noundef %9) #8 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end11 + %10 = load ptr, ptr %z, align 8 + %isnull16 = icmp eq ptr %10, null + br i1 %isnull16, label %delete.end18, label %delete.notnull17 + +delete.notnull17: ; preds = %delete.end15 + call void @_ZdaPv(ptr noundef %10) #8 + br label %delete.end18 + +delete.end18: ; preds = %delete.notnull17, %delete.end15 + %11 = load ptr, ptr %s, align 8 + %isnull19 = icmp eq ptr %11, null + br i1 %isnull19, label %delete.end21, label %delete.notnull20 + +delete.notnull20: ; preds = %delete.end18 + call void @_ZdaPv(ptr noundef %11) #8 + br label %delete.end21 + +delete.end21: ; preds = %delete.notnull20, %delete.end18 + ret i32 0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define internal void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) #7 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define internal void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_ZN1A1xEv(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !18 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_ZN1B1xEv(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !19 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #9, !memprof !20, !callsite !33 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #6 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { noinline nounwind optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { nounwind } +attributes #8 = { builtin nounwind } +attributes #9 = { builtin allocsize(0) } + +!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFPcvE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFPcvE.virtual"} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"PIE Level", i32 2} +!9 = !{i32 7, !"uwtable", i32 2} +!10 = !{i32 7, !"frame-pointer", i32 2} +!11 = !{i64 -4820244510750103755} +!12 = !{i64 8632435727821051414} +!13 = !{i64 -3421689549917153178} +!14 = !{i64 6792096022461663180} +!15 = !{i64 -2709642582978494015} +!16 = !{i64 748269490701775343} +!17 = !{i64 -5747251260480066785} +!18 = !{i64 8256774051149711748} +!19 = !{i64 -4831879094954754638} +!20 = !{!21, !23, !25, !27, !29, !31} +!21 = !{!22, !"notcold"} +!22 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} +!23 = !{!24, !"cold"} +!24 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} +!25 = !{!26, !"notcold"} +!26 = !{i64 2732490490862098848, i64 8632435727821051414} +!27 = !{!28, !"cold"} +!28 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} +!29 = !{!30, !"notcold"} +!30 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} +!31 = !{!32, !"cold"} +!32 = !{i64 2732490490862098848, i64 -3421689549917153178} +!33 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[FOO:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 6, 8, 4 +; DUMP: AllocType 2 StackIds: 6, 8, 5 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: AllocType 2 StackIds: 7, 8, 2 +; DUMP: AllocType 1 StackIds: 7, 8, 3 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 + +; DUMP: Node [[AX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; Bar contains an indirect call, with multiple targets. It's call should be null. +; DUMP: Node [[BAR]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 + +; DUMP: Node [[MAIN3]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 4 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN4]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 5 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN5]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN6]] +; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[BX]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 +; DUMP: CallerEdges: + + +; DOT: digraph CallsiteContextGraph { +; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3foov -\> alloc",tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[BX:0x[a-z0-9]+]] [shape="record",label="OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov",tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +;; Bar contains an indirect call, with multiple targets. It's call should be null. +; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: 13626499562959447861\nnull call (external)",tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN2:0x[a-z0-9]+]] [shape="record",label="OrigId: 15737101490731057601\nmain -\> _Z3barP1A",tooltip="N[[MAIN2]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[MAIN3:0x[a-z0-9]+]] [shape="record",label="OrigId: 6792096022461663180\nmain -\> _Z3barP1A",tooltip="N[[MAIN3]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN4:0x[a-z0-9]+]] [shape="record",label="OrigId: 12699492813229484831\nmain -\> _Z3barP1A",tooltip="N[[MAIN4]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN5:0x[a-z0-9]+]] [shape="record",label="OrigId: 748269490701775343\nmain -\> _Z3barP1A",tooltip="N[[MAIN5]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[MAIN6:0x[a-z0-9]+]] [shape="record",label="OrigId: 8632435727821051414\nmain -\> _Z3foov",tooltip="N[[MAIN6]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[AX:0x[a-z0-9]+]] [shape="record",label="OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov",tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: // Edges: +; DOT: N[[AX]] -> N[[FOO]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN6]] -> N[[FOO]][tooltip=" ContextIds: 3",fillcolor="brown1"]; // default +; DOT: N[[BX]] -> N[[FOO]][tooltip=" ContextIds: 4 5",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 6",fillcolor="cyan"]; // cold +; DOT: N[[BAR]] -> N[[BX]][tooltip=" ContextIds: 4 5",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN5]] -> N[[BAR]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default +; DOT: N[[MAIN4]] -> N[[BAR]][tooltip=" ContextIds: 2",fillcolor="cyan"]; // cold +; DOT: N[[MAIN3]] -> N[[BAR]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold +; DOT: N[[MAIN2]] -> N[[BAR]][tooltip=" ContextIds: 5",fillcolor="brown1"]; // default +; DOT: N[[BAR]] -> N[[AX]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: } diff --git a/llvm/test/ThinLTO/X86/pgho-inlined.ll b/llvm/test/ThinLTO/X86/pgho-inlined.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-inlined.ll @@ -0,0 +1,244 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and partial inlining, requiring generation of a new fused node to +;; represent the inlined sequence while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of baz into foo, and +;; bar into baz. Due to the inlining of bar we will initially have two +;; allocation nodes in the graph. This tests that we correctly match +;; foo (with baz inlined) onto the graph nodes first, and generate a new +;; fused node for it. We should then not match baz (with bar inlined) as that +;; is not reached by the MIB contexts (since all calls from main will look +;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'inlined.ll' +source_filename = "inlined.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress uwtable +define internal noundef ptr @_Z3barv() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress uwtable +define internal noundef ptr @_Z3bazv() #0 { +entry: + %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !13 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !14 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #3 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !15 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !16 + store ptr %call1, ptr %y, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %2, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %2) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call2 = call i32 @sleep(i32 noundef 10) + %3 = load ptr, ptr %y, align 8 + %isnull3 = icmp eq ptr %3, null + br i1 %isnull3, label %delete.end5, label %delete.notnull4 + +delete.notnull4: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %3) #8 + br label %delete.end5 + +delete.end5: ; preds = %delete.notnull4, %delete.end + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #5 + +declare i32 @sleep(i32 noundef) #6 + +attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !{i64 9086428284934609951, i64 -5964873800580613432} +!14 = !{i64 -5964873800580613432, i64 2732490490862098848} +!15 = !{i64 8632435727821051414} +!16 = !{i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 + +;; This is the node synthesized for the call to bar in foo that was created +;; by inlining baz into foo. +; DUMP: Node [[FOO]] +; DUMP: Callee: 16064618363798697104 (_Z3barv) Clones: 0 StackIds: 0, 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 + +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[BAZ:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 1, 2 +; DUMP: AllocType 2 StackIds: 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; This is leftover from the MIB on the alloc inlined into baz. It is not +;; matched with any call, since there is no such node in the IR. Due to the +;; null call it will not participate in any context transformations. +; DUMP: Node [[FOO2]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 + + +; DOT: digraph CallsiteContextGraph { +; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3bazv -\> alloc",tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\nnull call (external)",tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN2:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 8632435727821051414\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc2\n_Z3barv -\> alloc",tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[FOO2:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z3foov -\> _Z3barv",tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: // Edges: +; DOT: N[[FOO]] -> N[[BAR]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default +; DOT: N[[MAIN2]] -> N[[FOO]][tooltip=" ContextIds: 2",fillcolor="cyan"]; // cold +; DOT: N[[FOO2]] -> N[[BAZ]][tooltip=" ContextIds: 3 4",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN1]] -> N[[FOO2]][tooltip=" ContextIds: 3",fillcolor="brown1"]; // default +; DOT: N[[MAIN2]] -> N[[FOO2]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold +; DOT: } diff --git a/llvm/test/ThinLTO/X86/pgho-inlined2.ll b/llvm/test/ThinLTO/X86/pgho-inlined2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-inlined2.ll @@ -0,0 +1,179 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and multiple levels of inlining, requiring generation of new +;; fused nodes to represent the inlined sequence while matching callsite +;; nodes onto the graph. In particular this tests the case where a function +;; has inlined a callee containing an inlined callee. +;; +;; Original code looks like: +;; +;; char *bar() __attribute((noinline)) { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Both foo and baz are inlined into main, at both foo callsites. +;; We should update the graph for new fused nodes for both of those inlined +;; callsites to bar. +;; +;; Note that baz and bar are both dead due to the inlining, but have been left +;; in the input IR to ensure that the MIB call chain is matched to the longer +;; inline sequences from main. + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP + + +; ModuleID = 'inlined2.ll' +source_filename = "inlined2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define dso_local noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12, !heapallocsite !13 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #2 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !14 + ret ptr %call +} + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !15 + ret ptr %call.i +} + +; Function Attrs: mustprogress norecurse uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #3 { +delete.end5: + %call.i.i = call noundef ptr @_Z3barv(), !callsite !16 + %call.i.i8 = call noundef ptr @_Z3barv(), !callsite !17 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call.i.i8, i8 0, i64 10, i1 false) + call void @_ZdaPv(ptr noundef nonnull %call.i.i) #8 + %call2 = call i32 @sleep(i32 noundef 10) + call void @_ZdaPv(ptr noundef nonnull %call.i.i8) #8 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) local_unnamed_addr #5 + +declare i32 @sleep(i32 noundef) local_unnamed_addr #6 + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!14 = !{i64 -5964873800580613432} +!15 = !{i64 -5964873800580613432, i64 2732490490862098848} +!16 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!17 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} + +^0 = module: (path: "", hash: (0, 0, 0, 0, 0)) +^1 = gv: (name: "sleep") ; guid = 3188927404885408457 +^2 = gv: (name: "_ZdaPv") ; guid = 8244930240056412646 +^3 = gv: (name: "_Z3foov", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848)))))) ; guid = 9191153033785521275 +^4 = gv: (name: "llvm.memset.p0.i64") ; guid = 12767501690323846396 +^5 = gv: (name: "_Z3bazv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000 +^6 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 7, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7), (callee: ^2), (callee: ^1)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))) ; guid = 15822663052811949562 +^7 = gv: (name: "_Z3barv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^8)), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 17377440600225628772 +^8 = gv: (name: "_Znam") ; guid = 18423971256537370017 +^9 = flags: 8 +^10 = blockcount: 4 + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 0, 1, 2 +; DUMP: AllocType 2 StackIds: 0, 1, 3 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +;; This is the node synthesized for the first inlined call chain of main->foo->baz +; DUMP: Node [[MAIN1]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +;; This is the node synthesized for the second inlined call chain of main->foo->baz +; DUMP: Node [[MAIN2]] +; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 3 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll b/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll @@ -0,0 +1,203 @@ +;; Test callsite context graph generation for simple call graph with +;; two memprof contexts and no inlining. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. + +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'basic.ll' +source_filename = "basic.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #0 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !7 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !8 + store ptr %call1, ptr %y, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %2, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %2) #6 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call2 = call i32 @sleep(i32 noundef 10) + %3 = load ptr, ptr %y, align 8 + %isnull3 = icmp eq ptr %3, null + br i1 %isnull3, label %delete.end5, label %delete.notnull4 + +delete.notnull4: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %3) #6 + br label %delete.end5 + +delete.end5: ; preds = %delete.notnull4, %delete.end + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #2 + +declare i32 @sleep(i32 noundef) #3 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3barv() #4 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !9, !callsite !14 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #5 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3bazv() #4 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !15 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #4 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !16 + ret ptr %call +} + +attributes #0 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin nounwind } +attributes #7 = { builtin allocsize(0) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{i64 8632435727821051414} +!8 = !{i64 -3421689549917153178} +!9 = !{!10, !12} +!10 = !{!11, !"notcold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!12 = !{!13, !"cold"} +!13 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!14 = !{i64 9086428284934609951} +!15 = !{i64 -5964873800580613432} +!16 = !{i64 2732490490862098848} + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + + +; DOT: digraph CallsiteContextGraph { +; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3barv -\> _Znam",tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv",tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv",tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN2:0x[a-z0-9]+]] [shape="record",label="OrigId: 8632435727821051414\nmain -\> _Z3foov",tooltip="N[[MAIN2]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: // Edges: +; DOT: N[[BAZ]] -> N[[BAR]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[FOO]] -> N[[BAZ]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN2]] -> N[[FOO]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default +; DOT: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 2",fillcolor="cyan"]; // cold +; DOT: } diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll @@ -0,0 +1,300 @@ +;; Test callsite context graph generation for call graph with with MIBs +;; that have pruned contexts that partially match multiple inlined +;; callsite contexts, requiring duplication of context ids and nodes +;; while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *D() { +;; return new char[10]; +;; } +;; +;; char *F() { +;; return D(); +;; } +;; +;; char *C() { +;; return D(); +;; } +;; +;; char *B() { +;; return C(); +;; } +;; +;; char *E() { +;; return C(); +;; } +;; int main(int argc, char **argv) { +;; char *x = B(); // cold +;; char *y = E(); // cold +;; char *z = F(); // default +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; delete[] z; +;; sleep(10); +;; delete[] x; +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of C into both B and E. +;; Since both allocation contexts via C are cold, the matched memprof +;; metadata has the context pruned above C's callsite. This requires +;; matching the stack node for C to callsites where it was inlined (i.e. +;; the callsites in B and E that have callsite metadata that includes C's). +;; It also requires duplication of that node in the graph as well as the +;; duplication of the context ids along that path through the graph, +;; so that we can represent the duplicated (via inlining) C callsite. + +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST + +; ModuleID = 'duplicate-context-ids.ll' +source_filename = "duplicate-context-ids.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Dv() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !12 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Fv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !13 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Cv() #0 { +entry: + %call = call noundef ptr @_Z1Dv(), !callsite !14 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Bv() #0 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !15 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z1Ev() #0 { +entry: + %call.i = call noundef ptr @_Z1Dv(), !callsite !16 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + %z = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z1Bv(), !callsite !17 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z1Ev(), !callsite !18 + store ptr %call1, ptr %y, align 8 + %call2 = call noundef ptr @_Z1Fv(), !callsite !19 + store ptr %call2, ptr %z, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %z, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %z, align 8 + %isnull = icmp eq ptr %3, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %3) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call3 = call i32 @sleep(i32 noundef 10) + %4 = load ptr, ptr %x, align 8 + %isnull4 = icmp eq ptr %4, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %4) #7 + br label %delete.end6 + +delete.end6: ; preds = %delete.notnull5, %delete.end + %5 = load ptr, ptr %y, align 8 + %isnull7 = icmp eq ptr %5, null + br i1 %isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %5) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"cold"} +!9 = !{i64 6541423618768552252, i64 -6270142974039008131} +!10 = !{!11, !"notcold"} +!11 = !{i64 6541423618768552252, i64 -4903163940066524832} +!12 = !{i64 6541423618768552252} +!13 = !{i64 -4903163940066524832} +!14 = !{i64 -6270142974039008131} +!15 = !{i64 -6270142974039008131, i64 -184525619819294889} +!16 = !{i64 -6270142974039008131, i64 1905834578520680781} +!17 = !{i64 8632435727821051414} +!18 = !{i64 -3421689549917153178} +!19 = !{i64 6307901912192269588} + + +;; After adding only the alloc node memprof metadata, we only have 2 contexts. + +; DUMP: CCG before updating call stack chains: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 + +; DUMP: Node [[C]] +; DUMP: null Call +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[F]] +; DUMP: null Call +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +;; After updating for callsite metadata, we should have generated context ids 3 and 4, +;; along with 2 new nodes for those callsites. All have the same allocation type +;; behavior as the original C node. + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[D]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3 +; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 + +; DUMP: Node [[F]] +; DUMP: %call = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[C2]] +; DUMP: %call = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: %call.i = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[E]] +; DUMP: %call.i = call noundef ptr @_Z1Dv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1 +; DUMP: CallerEdges: + + +; DOTPRE: digraph CallsiteContextGraph { +; DOTPRE: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> _Znam",tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOTPRE: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\nnull call (external)",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOTPRE: N[[C:0x[a-z0-9]+]] [shape="record",label="OrigId: 12176601099670543485\nnull call (external)",tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPRE: // Edges: +; DOTPRE: N[[C]] -> N[[D]][tooltip=" ContextIds: 1",fillcolor="cyan"]; // cold +; DOTPRE: N[[F]] -> N[[D]][tooltip=" ContextIds: 2",fillcolor="brown1"]; // default +; DOTPRE: } + + +; DOTPOST: digraph CallsiteContextGraph { +; DOTPOST: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> _Znam",tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOTPOST: N[[E:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z1Ev -\> _Z1Dv",tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPOST: N[[B:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z1Bv -\> _Z1Dv",tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPOST: N[[C:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z1Cv -\> _Z1Dv",tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOTPOST: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOTPOST: // Edges: +; DOTPOST: N[[F]] -> N[[D]][tooltip=" ContextIds: 2",fillcolor="brown1"]; // default +; DOTPOST: N[[C]] -> N[[D]][tooltip=" ContextIds: 3",fillcolor="cyan"]; // cold +; DOTPOST: N[[B]] -> N[[D]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold +; DOTPOST: N[[E]] -> N[[D]][tooltip=" ContextIds: 1",fillcolor="cyan"]; // cold +; DOTPOST: } diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll @@ -0,0 +1,435 @@ +;; Tests callsite context graph generation for call graph containing indirect +;; calls. Currently this should result in conservative behavior, such that the +;; indirect call receives a null call in its graph node, to prevent subsequent +;; cloning. +;; +;; Original code looks like: +;; +;; char *foo() { +;; return new char[10]; +;; } +;; class A { +;; public: +;; virtual char *x() { return foo(); } +;; }; +;; class B : public A { +;; public: +;; char *x() final { return foo(); } +;; }; +;; char *bar(A *a) { +;; return a->x(); +;; } +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; B b; +;; char *z = bar(&b); +;; char *w = bar(&b); +;; A a; +;; char *r = bar(&a); +;; char *s = bar(&a); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; memset(z, 0, 10); +;; memset(w, 0, 10); +;; memset(r, 0, 10); +;; memset(s, 0, 10); +;; delete[] x; +;; delete[] w; +;; delete[] r; +;; sleep(10); +;; delete[] y; +;; delete[] z; +;; delete[] s; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Compiled without optimization to prevent inlining and devirtualization. + +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'indirectcall.ll' +source_filename = "indirectcall.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.B = type { %class.A } +%class.A = type { ptr } + +$_ZN1BC2Ev = comdat any + +$_ZN1AC2Ev = comdat any + +$_ZN1A1xEv = comdat any + +$_ZN1B1xEv = comdat any + +$_ZTV1B = comdat any + +$_ZTS1B = comdat any + +$_ZTS1A = comdat any + +$_ZTI1A = comdat any + +$_ZTI1B = comdat any + +$_ZTV1A = comdat any + +@_ZTV1B = internal unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1xEv] }, comdat, align 8, !type !0, !type !1, !type !2, !type !3 +@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr +@_ZTS1B = internal constant [3 x i8] c"1B\00", comdat, align 1 +@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr +@_ZTS1A = internal constant [3 x i8] c"1A\00", comdat, align 1 +@_ZTI1A = internal constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 +@_ZTI1B = internal constant { ptr, ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 +@_ZTV1A = internal unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1xEv] }, comdat, align 8, !type !0, !type !1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr, align 8 + store ptr %a, ptr %a.addr, align 8 + %0 = load ptr, ptr %a.addr, align 8 + %vtable = load ptr, ptr %0, align 8 + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %1 = load ptr, ptr %vfn, align 8 + %call = call noundef ptr %1(ptr noundef nonnull align 8 dereferenceable(8) %0), !callsite !11 + ret ptr %call +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #1 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + %b = alloca %class.B, align 8 + %z = alloca ptr, align 8 + %w = alloca ptr, align 8 + %a = alloca %class.A, align 8 + %r = alloca ptr, align 8 + %s = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !12 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !13 + store ptr %call1, ptr %y, align 8 + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) #7 + %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !14 + store ptr %call2, ptr %z, align 8 + %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b), !callsite !15 + store ptr %call3, ptr %w, align 8 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) #7 + %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !16 + store ptr %call4, ptr %r, align 8 + %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a), !callsite !17 + store ptr %call5, ptr %s, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %z, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %w, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %r, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %s, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %w, align 8 + %isnull6 = icmp eq ptr %7, null + br i1 %isnull6, label %delete.end8, label %delete.notnull7 + +delete.notnull7: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #8 + br label %delete.end8 + +delete.end8: ; preds = %delete.notnull7, %delete.end + %8 = load ptr, ptr %r, align 8 + %isnull9 = icmp eq ptr %8, null + br i1 %isnull9, label %delete.end11, label %delete.notnull10 + +delete.notnull10: ; preds = %delete.end8 + call void @_ZdaPv(ptr noundef %8) #8 + br label %delete.end11 + +delete.end11: ; preds = %delete.notnull10, %delete.end8 + %call12 = call i32 @sleep(i32 noundef 10) + %9 = load ptr, ptr %y, align 8 + %isnull13 = icmp eq ptr %9, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end11 + call void @_ZdaPv(ptr noundef %9) #8 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end11 + %10 = load ptr, ptr %z, align 8 + %isnull16 = icmp eq ptr %10, null + br i1 %isnull16, label %delete.end18, label %delete.notnull17 + +delete.notnull17: ; preds = %delete.end15 + call void @_ZdaPv(ptr noundef %10) #8 + br label %delete.end18 + +delete.end18: ; preds = %delete.notnull17, %delete.end15 + %11 = load ptr, ptr %s, align 8 + %isnull19 = icmp eq ptr %11, null + br i1 %isnull19, label %delete.end21, label %delete.notnull20 + +delete.notnull20: ; preds = %delete.end18 + call void @_ZdaPv(ptr noundef %11) #8 + br label %delete.end21 + +delete.end21: ; preds = %delete.notnull20, %delete.end18 + ret i32 0 +} + +; Function Attrs: noinline nounwind optnone uwtable +define internal void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) #7 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +; Function Attrs: noinline nounwind optnone uwtable +define internal void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #2 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_ZN1A1xEv(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !18 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_ZN1B1xEv(ptr noundef nonnull align 8 dereferenceable(8) %this) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !19 + ret ptr %call +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #9, !memprof !20, !callsite !33 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #6 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { noinline nounwind optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { nounwind } +attributes #8 = { builtin nounwind } +attributes #9 = { builtin allocsize(0) } + +!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFPcvE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFPcvE.virtual"} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"PIE Level", i32 2} +!9 = !{i32 7, !"uwtable", i32 2} +!10 = !{i32 7, !"frame-pointer", i32 2} +!11 = !{i64 -4820244510750103755} +!12 = !{i64 8632435727821051414} +!13 = !{i64 -3421689549917153178} +!14 = !{i64 6792096022461663180} +!15 = !{i64 -2709642582978494015} +!16 = !{i64 748269490701775343} +!17 = !{i64 -5747251260480066785} +!18 = !{i64 8256774051149711748} +!19 = !{i64 -4831879094954754638} +!20 = !{!21, !23, !25, !27, !29, !31} +!21 = !{!22, !"notcold"} +!22 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343} +!23 = !{!24, !"cold"} +!24 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785} +!25 = !{!26, !"notcold"} +!26 = !{i64 2732490490862098848, i64 8632435727821051414} +!27 = !{!28, !"cold"} +!28 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180} +!29 = !{!30, !"notcold"} +!30 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015} +!31 = !{!32, !"cold"} +!32 = !{i64 2732490490862098848, i64 -3421689549917153178} +!33 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[FOO:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 3 4 5 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6 + +; DUMP: Node [[AX]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; Bar contains an indirect call, with multiple targets. It's call should be null. +; DUMP: Node [[BAR]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5 + +; DUMP: Node [[MAIN3]] +; DUMP: %call4 = call noundef ptr @_Z3barP1A(ptr noundef %a) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN4]] +; DUMP: %call5 = call noundef ptr @_Z3barP1A(ptr noundef %a) (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN5]] +; DUMP: %call2 = call noundef ptr @_Z3barP1A(ptr noundef %b) (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN6]] +; DUMP: %call3 = call noundef ptr @_Z3barP1A(ptr noundef %b) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: CallerEdges: + +; DUMP: Node [[BX]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 4 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5 + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6 +; DUMP: CallerEdges: + + +; DOT: digraph CallsiteContextGraph { +; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3foov -\> _Znam",tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[BX:0x[a-z0-9]+]] [shape="record",label="OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov",tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +;; Bar contains an indirect call, with multiple targets. It's call should be null. +; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: 13626499562959447861\nnull call (external)",tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN2:0x[a-z0-9]+]] [shape="record",label="OrigId: 15737101490731057601\nmain -\> _Z3barP1A",tooltip="N[[MAIN2]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[MAIN3:0x[a-z0-9]+]] [shape="record",label="OrigId: 6792096022461663180\nmain -\> _Z3barP1A",tooltip="N[[MAIN3]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN4:0x[a-z0-9]+]] [shape="record",label="OrigId: 12699492813229484831\nmain -\> _Z3barP1A",tooltip="N[[MAIN4]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN5:0x[a-z0-9]+]] [shape="record",label="OrigId: 748269490701775343\nmain -\> _Z3barP1A",tooltip="N[[MAIN5]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[MAIN6:0x[a-z0-9]+]] [shape="record",label="OrigId: 8632435727821051414\nmain -\> _Z3foov",tooltip="N[[MAIN6]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[AX:0x[a-z0-9]+]] [shape="record",label="OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov",tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: // Edges: +; DOT: N[[AX]] -> N[[FOO]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN6]] -> N[[FOO]][tooltip=" ContextIds: 3",fillcolor="brown1"]; // default +; DOT: N[[BX]] -> N[[FOO]][tooltip=" ContextIds: 4 5",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 6",fillcolor="cyan"]; // cold +; DOT: N[[BAR]] -> N[[BX]][tooltip=" ContextIds: 4 5",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN5]] -> N[[BAR]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default +; DOT: N[[MAIN4]] -> N[[BAR]][tooltip=" ContextIds: 2",fillcolor="cyan"]; // cold +; DOT: N[[MAIN3]] -> N[[BAR]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold +; DOT: N[[MAIN2]] -> N[[BAR]][tooltip=" ContextIds: 5",fillcolor="brown1"]; // default +; DOT: N[[BAR]] -> N[[AX]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: } diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll b/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll @@ -0,0 +1,233 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and partial inlining, requiring generation of a new fused node to +;; represent the inlined sequence while matching callsite nodes onto the graph. +;; +;; Original code looks like: +;; +;; char *bar() { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The code below was created by forcing inlining of baz into foo, and +;; bar into baz. Due to the inlining of bar we will initially have two +;; allocation nodes in the graph. This tests that we correctly match +;; foo (with baz inlined) onto the graph nodes first, and generate a new +;; fused node for it. We should then not match baz (with bar inlined) as that +;; is not reached by the MIB contexts (since all calls from main will look +;; like main -> foo(+baz) -> bar after the inlining reflected in this IR). + +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT + +; ModuleID = 'inlined.ll' +source_filename = "inlined.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress uwtable +define internal noundef ptr @_Z3barv() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress uwtable +define internal noundef ptr @_Z3bazv() #0 { +entry: + %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !13 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal noundef ptr @_Z3foov() #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !14 + ret ptr %call.i +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #3 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %x = alloca ptr, align 8 + %y = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + %call = call noundef ptr @_Z3foov(), !callsite !15 + store ptr %call, ptr %x, align 8 + %call1 = call noundef ptr @_Z3foov(), !callsite !16 + store ptr %call1, ptr %y, align 8 + %0 = load ptr, ptr %x, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %y, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %x, align 8 + %isnull = icmp eq ptr %2, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %2) #8 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %call2 = call i32 @sleep(i32 noundef 10) + %3 = load ptr, ptr %y, align 8 + %isnull3 = icmp eq ptr %3, null + br i1 %isnull3, label %delete.end5, label %delete.notnull4 + +delete.notnull4: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %3) #8 + br label %delete.end5 + +delete.end5: ; preds = %delete.notnull4, %delete.end + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #5 + +declare i32 @sleep(i32 noundef) #6 + +attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !{i64 9086428284934609951, i64 -5964873800580613432} +!14 = !{i64 -5964873800580613432, i64 2732490490862098848} +!15 = !{i64 8632435727821051414} +!16 = !{i64 -3421689549917153178} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +;; This is the node synthesized for the call to bar in foo that was created +;; by inlining baz into foo. +; DUMP: Node [[FOO]] +; DUMP: %call.i = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[BAZ:0x[a-z0-9]+]] +; DUMP: %call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 3 4 + +;; This is leftover from the MIB on the alloc inlined into baz. It is not +;; matched with any call, since there is no such node in the IR. Due to the +;; null call it will not participate in any context transformations. +; DUMP: Node [[FOO2]] +; DUMP: null Call +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 3 4 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 + + +; DOT: digraph CallsiteContextGraph { +; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc2\n_Z3bazv -\> _Znam",tooltip="N[[BAZ]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[FOO2:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\nnull call (external)",tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[MAIN2:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold +; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 8632435727821051414\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled"]; // callsite, default +; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3barv -\> _Znam",tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 0\n_Z3foov -\> _Z3barv",tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold +; DOT: // Edges: +; DOT: N[[FOO2]] -> N[[BAZ]][tooltip=" ContextIds: 3 4",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN1]] -> N[[FOO2]][tooltip=" ContextIds: 3",fillcolor="brown1"]; // default +; DOT: N[[MAIN2]] -> N[[FOO2]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold +; DOT: N[[FOO]] -> N[[BAR]][tooltip=" ContextIds: 1 2",fillcolor="mediumorchid1"]; // default|cold +; DOT: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default +; DOT: N[[MAIN2]] -> N[[FOO]][tooltip=" ContextIds: 2",fillcolor="cyan"]; // cold +; DOT: } diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/inlined2.ll b/llvm/test/Transforms/PGHOContextDisambiguation/inlined2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/inlined2.ll @@ -0,0 +1,168 @@ +;; Test callsite context graph generation for call graph with two memprof +;; contexts and multiple levels of inlining, requiring generation of new +;; fused nodes to represent the inlined sequence while matching callsite +;; nodes onto the graph. In particular this tests the case where a function +;; has inlined a callee containing an inlined callee. +;; +;; Original code looks like: +;; +;; char *bar() __attribute((noinline)) { +;; return new char[10]; +;; } +;; +;; char *baz() { +;; return bar(); +;; } +;; +;; char *foo() { +;; return baz(); +;; } +;; +;; int main(int argc, char **argv) { +;; char *x = foo(); +;; char *y = foo(); +;; memset(x, 0, 10); +;; memset(y, 0, 10); +;; delete[] x; +;; sleep(10); +;; delete[] y; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; Both foo and baz are inlined into main, at both foo callsites. +;; We should update the graph for new fused nodes for both of those inlined +;; callsites to bar. +;; +;; Note that baz and bar are both dead due to the inlining, but have been left +;; in the input IR to ensure that the MIB call chain is matched to the longer +;; inline sequences from main. + +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP + + +; ModuleID = 'inlined2.ll' +source_filename = "inlined2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define dso_local noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !memprof !7, !callsite !12, !heapallocsite !13 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1 + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #2 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !14 + ret ptr %call +} + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #2 { +entry: + %call.i = call noundef ptr @_Z3barv(), !callsite !15 + ret ptr %call.i +} + +; Function Attrs: mustprogress norecurse uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #3 { +delete.end5: + %call.i.i = call noundef ptr @_Z3barv(), !callsite !16 + %call.i.i8 = call noundef ptr @_Z3barv(), !callsite !17 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call.i.i8, i8 0, i64 10, i1 false) + call void @_ZdaPv(ptr noundef nonnull %call.i.i) #8 + %call2 = call i32 @sleep(i32 noundef 10) + call void @_ZdaPv(ptr noundef nonnull %call.i.i8) #8 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) local_unnamed_addr #5 + +declare i32 @sleep(i32 noundef) local_unnamed_addr #6 + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10} +!8 = !{!9, !"notcold"} +!9 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!10 = !{!11, !"cold"} +!11 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!12 = !{i64 9086428284934609951} +!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!14 = !{i64 -5964873800580613432} +!15 = !{i64 -5964873800580613432, i64 2732490490862098848} +!16 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!17 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} + +^0 = module: (path: "", hash: (0, 0, 0, 0, 0)) +^1 = gv: (name: "sleep") ; guid = 3188927404885408457 +^2 = gv: (name: "_ZdaPv") ; guid = 8244930240056412646 +^3 = gv: (name: "_Z3foov", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848)))))) ; guid = 9191153033785521275 +^4 = gv: (name: "llvm.memset.p0.i64") ; guid = 12767501690323846396 +^5 = gv: (name: "_Z3bazv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000 +^6 = gv: (name: "main", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 7, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^7), (callee: ^2), (callee: ^1)), callsites: ((callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (callee: ^7, clones: (0), stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))) ; guid = 15822663052811949562 +^7 = gv: (name: "_Z3barv", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 1, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 1, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^8)), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 17377440600225628772 +^8 = gv: (name: "_Znam") ; guid = 18423971256537370017 +^9 = flags: 8 +^10 = blockcount: 4 + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #7, !heapallocsite !7 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +;; This is the node synthesized for the first inlined call chain of main->foo->baz +; DUMP: Node [[MAIN1]] +; DUMP: %call.i.i = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +;; This is the node synthesized for the second inlined call chain of main->foo->baz +; DUMP: Node [[MAIN2]] +; DUMP: %call.i.i8 = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/pass-pipeline.ll b/llvm/test/Transforms/PGHOContextDisambiguation/pass-pipeline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/pass-pipeline.ll @@ -0,0 +1,41 @@ +;; Test that PGHOContextDisambiguation is enabled under the expected conditions +;; and in the expected position. + +;; Pass is not currently enabled by default at any opt level. +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: PGHOContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: PGHOContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: PGHOContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: PGHOContextDisambiguation" + +;; Pass should not run even under option at O0/O1. +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: -enable-pgho-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: PGHOContextDisambiguation" +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: -enable-pgho-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --implicit-check-not="Running pass: PGHOContextDisambiguation" + +;; Pass should be enabled under option at O2/O3. +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: -enable-pgho-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --check-prefix=ENABLED +; RUN: opt -debug-pass-manager -passes='lto' -S %s \ +; RUN: -enable-pgho-context-disambiguation \ +; RUN: 2>&1 | FileCheck %s --check-prefix=ENABLED + +;; When enabled, PGHOContextDisambiguation runs just after inlining. +; ENABLED: Running pass: InlinerPass +; ENABLED: Invalidating analysis: InlineAdvisorAnalysis +; ENABLED: Running pass: PGHOContextDisambiguation + +define noundef ptr @_Z3barv() { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) + ret ptr %call +} + +declare noundef nonnull ptr @_Znam(i64 noundef)