diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -25,11 +25,14 @@
 class GlobalValueSummary;
 class Module;
 class ModuleSummaryIndex;
+class OptimizationRemarkEmitter;
 
 class MemProfContextDisambiguation
     : public PassInfoMixin<MemProfContextDisambiguation> {
   /// Run the context disambiguator on \p M; returns true if any changes were made.
-  bool processModule(Module &M);
+  bool processModule(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
 public:
   MemProfContextDisambiguation() {}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -27,8 +27,10 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -39,6 +41,7 @@
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include <sstream>
 #include <vector>
 
 using namespace llvm;
@@ -46,6 +49,13 @@
 
 #define DEBUG_TYPE "memprof-context-disambiguation"
 
+STATISTIC(FunctionClonesAnalysis,
+          "Number of function clones created during whole program analysis");
+STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
+                            "cloned) during whole program analysis");
+STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
+                         "during whole program analysis");
+
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
     cl::value_desc("filename"),
@@ -95,6 +105,13 @@
   /// behavior of an allocation based on its context.
   void identifyClones();
 
+  /// Assign callsite clones to functions, cloning functions as needed to
+  /// accommodate the combinations of their callsite clones reached by callers.
+  /// For regular LTO this clones functions and callsites in the IR, but for
+  /// ThinLTO the cloning decisions are noted in the summaries and applied
+  /// later.
+  bool assignFunctions();
+
   void dump() const;
   void print(raw_ostream &OS) const;
 
@@ -375,6 +392,28 @@
     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
   }
 
+  /// Update the allocation call to record the type of allocated memory.
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
+    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
+    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
+  }
+
+  /// Update non-allocation call to invoke (possibly cloned) function
+  /// CalleeFunc.
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
+    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
+  }
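
(Editorial aside, not part of the patch: the wrappers above rely on the graph's CRTP structure, where the shared base forwards through static_cast<DerivedCCG *> to the IR-based or summary-based implementation without virtual dispatch. A minimal standalone sketch of the idiom, with invented names rather than the pass's actual classes:)

    #include <cstdio>

    template <typename DerivedCCG> struct GraphBase {
      // Statically dispatches to the derived class's implementation.
      void updateAllocationCall(int CallId, bool Cold) {
        static_cast<DerivedCCG *>(this)->updateAllocationCallImpl(CallId, Cold);
      }
    };

    struct ModuleGraph : GraphBase<ModuleGraph> {
      void updateAllocationCallImpl(int CallId, bool Cold) {
        std::printf("call %d -> memprof=%s\n", CallId, Cold ? "cold" : "notcold");
      }
    };

    int main() {
      ModuleGraph G;
      G.updateAllocationCall(42, /*Cold=*/true); // resolved at compile time
    }
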
+  /// Clone the given function for the given callsite, recording mapping of all
+  /// of the function's tracked calls to their new versions in the CallMap.
+  /// Assigns new clones to clone number CloneNo.
+  FuncInfo cloneFunctionForCallsite(
+      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
+        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
+  }
+
   /// Gets a label to use in the dot graph for the given call clone in the given
   /// function.
   std::string getLabel(const FuncTy *Func, const CallTy Call,
@@ -469,7 +508,9 @@
     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                   Instruction *> {
 public:
-  ModuleCallsiteContextGraph(Module &M);
+  ModuleCallsiteContextGraph(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
 private:
   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                               Instruction *>;
 
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                       Instruction *>::FuncInfo
+  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+                           std::map<CallInfo, CallInfo> &CallMap,
+                           std::vector<CallInfo> &CallsWithMetadataInFunc,
+                           unsigned CloneNo);
   std::string getLabel(const Function *Func, const Instruction *Call,
                        unsigned CloneNo) const;
 
   const Module &Mod;
+  function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
 };
 
 /// Represents a call in the summary index graph, which can either be an
@@ -527,6 +577,14 @@
   bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
   uint64_t getLastStackId(IndexCall &Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                       IndexCall>::FuncInfo
+  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+                           std::map<CallInfo, CallInfo> &CallMap,
+                           std::vector<CallInfo> &CallsWithMetadataInFunc,
+                           unsigned CloneNo);
   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                        unsigned CloneNo) const;
 
@@ -1282,10 +1340,14 @@
   return Index.getStackIdAtIndex(CallsiteContext.back());
 }
 
+static const std::string MemProfCloneSuffix = ".memprof.";
+
 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
+  // We use CloneNo == 0 to refer to the original version, which doesn't get
+  // renamed with a suffix.
   if (!CloneNo)
     return Base.str();
-  return (Base + ".memprof." + Twine(CloneNo)).str();
+  return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
 }
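
(Editorial aside: with the suffix constant above, clone 0 keeps the original name while clone N is named "<name>.memprof.<N>", the names the REMARKS checks later in this patch expect, e.g. "_Z3foov" vs. "_Z3foov.memprof.1". A plain-C++ equivalent of the naming rule, minus the llvm::Twine plumbing:)

    #include <string>

    static const std::string MemProfCloneSuffix = ".memprof.";

    std::string getMemProfFuncName(const std::string &Base, unsigned CloneNo) {
      if (!CloneNo) // clone 0 is the original function and keeps its name
        return Base;
      return Base + MemProfCloneSuffix + std::to_string(CloneNo);
    }
    // getMemProfFuncName("_Z3foov", 1) == "_Z3foov.memprof.1"
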
 
 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
@@ -1347,7 +1409,9 @@
   return StackIds;
 }
 
-ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) {
+ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
+    Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
+    : Mod(M), OREGetter(OREGetter) {
   for (auto &F : M) {
     std::vector<CallInfo> CallsWithMetadata;
     for (auto &BB : F) {
@@ -1661,7 +1725,7 @@
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
-                      bool CheckEdges = false) {
+                      bool CheckEdges = true) {
   if (Node->isRemoved())
     return;
   // Node's context ids should be the union of both its callee and caller edge
@@ -1701,7 +1765,7 @@
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
   for (const auto Node : nodes<GraphType>(this)) {
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
     for (auto &Edge : Node->CallerEdges)
       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
   }
@@ -1925,12 +1989,14 @@
     NewEdge->Callee->CallerEdges.push_back(NewEdge);
   }
   if (VerifyCCG) {
-    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee);
-    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee);
+    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
+    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
-      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee);
+      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
+                                            /*CheckEdges=*/false);
     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
-      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee);
+      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
+                                            /*CheckEdges=*/false);
   }
 }
 
@@ -1945,7 +2011,7 @@
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
   if (VerifyNodes)
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
   assert(!Node->CloneOf);
 
   // If Node has a null call, then either it wasn't found in the module (regular
@@ -2099,7 +2165,7 @@
   for (auto *Clone : Node->Clones) {
     removeNoneTypeCalleeEdges(Clone);
     if (VerifyNodes)
-      checkNode<DerivedCCG, FuncTy, CallTy>(Clone, /*CheckEdges=*/true);
+      checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
   }
 
   // We should still have some context ids on the original Node.
   assert(!Node->ContextIds.empty());
@@ -2120,7 +2186,595 @@
   }));
   if (VerifyNodes)
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
 }
 
+static std::string getAllocTypeAttributeString(AllocationType Type) {
+  switch (Type) {
+  case AllocationType::NotCold:
+    return "notcold";
+    break;
+  case AllocationType::Cold:
+    return "cold";
+    break;
+  default:
+    dbgs() << "Unexpected alloc type " << (uint8_t)Type;
+    assert(false);
+  }
+  llvm_unreachable("invalid alloc type");
+}
+
+void ModuleCallsiteContextGraph::updateAllocationCall(
+    CallInfo &Call, AllocationType AllocType) {
+  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
+                                "memprof", AllocTypeString);
+  cast<CallBase>(Call.call())->addFnAttr(A);
+  OREGetter(Call.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
+            << ore::NV("AllocationCall", Call.call()) << " in clone "
+            << ore::NV("Caller", Call.call()->getFunction())
+            << " marked with memprof allocation attribute "
+            << ore::NV("Attribute", AllocTypeString));
+}
+
+void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
+                                                     AllocationType AllocType) {
+  auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI);
+  assert(AI->Versions.size() > Call.cloneNo());
+  AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
+}
+
+void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                            FuncInfo CalleeFunc) {
+  if (CalleeFunc.cloneNo() > 0)
+    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
+  OREGetter(CallerCall.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
+            << ore::NV("Call", CallerCall.call()) << " in clone "
+            << ore::NV("Caller", CallerCall.call()->getFunction())
+            << " assigned to call function clone "
+            << ore::NV("Callee", CalleeFunc.func()));
+}
+
+void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                           FuncInfo CalleeFunc) {
+  auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
+  assert(CI &&
+         "Caller cannot be an allocation which should not have profiled calls");
+  assert(CI->Clones.size() > CallerCall.cloneNo());
+  CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+}
+
+CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                     Instruction *>::FuncInfo
+ModuleCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Use existing LLVM facilities for cloning and obtaining Call in clone
+  ValueToValueMapTy VMap;
+  auto *NewFunc = CloneFunction(Func.func(), VMap);
+  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
+  assert(!Func.func()->getParent()->getFunction(Name));
+  NewFunc->setName(Name);
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    CallMap[Inst] = {cast<CallBase>(VMap[Inst.call()]), CloneNo};
+  }
+  OREGetter(Func.func())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
+            << "created clone " << ore::NV("NewFunction", NewFunc));
+  return {NewFunc, CloneNo};
+}
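
(Editorial aside: the module-level clone above leans on llvm::CloneFunction's ValueToValueMapTy to locate each tracked call's counterpart in the freshly cloned body. A compilable sketch of that pattern in isolation; this assumes the LLVM dev headers, and the helper name is invented for illustration:)

    #include "llvm/ADT/Twine.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/Transforms/Utils/Cloning.h"

    using namespace llvm;

    // Clone F under a new name and return the clone's copy of OldCall.
    static CallBase *cloneAndRemapCall(Function &F, CallBase *OldCall,
                                       const Twine &Name) {
      ValueToValueMapTy VMap; // CloneFunction fills in old->new mappings
      Function *NewF = CloneFunction(&F, VMap);
      NewF->setName(Name);
      // Every value of F, including OldCall, now has a mapped counterpart.
      return cast<CallBase>(VMap[OldCall]);
    }
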
+CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                     IndexCall>::FuncInfo
+IndexCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Check how many clones we have of Call (and therefore function).
+  // The next clone number is the current size of the versions array.
+  // Confirm this matches the CloneNo provided by the caller, which is based on
+  // the number of function clones we have.
+  assert(CloneNo ==
+         (Call.call().is<AllocInfo *>()
+              ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
+              : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
+  // Walk all the instructions in this function. Create a new version for
+  // each (by adding an entry to the Versions/Clones summary array), and copy
+  // over the version being called for the function clone being cloned here.
+  // Additionally, add an entry to the CallMap for the new function clone,
+  // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
+  // to the new call clone.
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
+      assert(AI->Versions.size() == CloneNo);
+      // We assign the allocation type later (in updateAllocationCall), just add
+      // an entry for it here.
+      AI->Versions.push_back(0);
+    } else {
+      auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
+      assert(CI && CI->Clones.size() == CloneNo);
+      // We assign the clone number later (in updateCall), just add an entry for
+      // it here.
+      CI->Clones.push_back(0);
+    }
+    CallMap[Inst] = {Inst.call(), CloneNo};
+  }
+  return {Func.func(), CloneNo};
+}
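
(Editorial aside: since the ThinLTO side only records decisions, the net effect of the summary variant above plus the later updateAllocationCall/updateCall is plain array bookkeeping. AllocInfo::Versions holds one allocation type per function clone, and CallsiteInfo::Clones holds the callee clone number each function clone should call. A toy model with simplified stand-ins for the summary types, for illustration only:)

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };
    struct AllocInfo { std::vector<uint8_t> Versions; };
    struct CallsiteInfo { std::vector<unsigned> Clones; };

    int main() {
      AllocInfo AI{{0}};    // entry for clone 0 (the original), decided later
      CallsiteInfo CI{{0}};

      // cloneFunctionForCallsite (index variant): a new function clone just
      // appends a placeholder entry; the decision is assigned afterwards.
      AI.Versions.push_back(0);
      CI.Clones.push_back(0);

      // updateAllocationCall / updateCall: fill in the decisions, e.g. clone 0
      // stays notcold and clone 1 becomes cold, calling callee clone 1.
      AI.Versions[0] = (uint8_t)AllocationType::NotCold;
      AI.Versions[1] = (uint8_t)AllocationType::Cold;
      CI.Clones[1] = 1;

      // Corresponds to distributed-index output like:
      //   allocs: ((versions: (notcold, cold) ... clones: (0, 1)
      std::printf("versions: (%u, %u) clones: (%u, %u)\n",
                  (unsigned)AI.Versions[0], (unsigned)AI.Versions[1],
                  CI.Clones[0], CI.Clones[1]);
    }
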
+// This method assigns cloned callsites to functions, cloning the functions as
+// needed. The assignment is greedy and proceeds roughly as follows:
+//
+// For each function Func:
+//   For each call with graph Node having clones:
+//     Initialize ClonesWorklist to Node and its clones
+//     Initialize NodeCloneCount to 0
+//     While ClonesWorklist is not empty:
+//        Clone = pop front ClonesWorklist
+//        NodeCloneCount++
+//        If Func has been cloned less than NodeCloneCount times:
+//           If NodeCloneCount is 1:
+//             Assign Clone to original Func
+//             Continue
+//           Create a new function clone
+//           If other callers not assigned to call a function clone yet:
+//              Assign them to call new function clone
+//              Continue
+//           Assign any other caller calling the cloned version to new clone
+//
+//        For each caller of Clone:
+//           If caller is assigned to call a specific function clone:
+//             If we cannot assign Clone to that function clone:
+//               Create new callsite Clone NewClone
+//               Add NewClone to ClonesWorklist
+//               Continue
+//             Assign Clone to existing caller's called function clone
+//           Else:
+//             If Clone not already assigned to a function clone:
+//                Assign to first function clone without assignment
+//             Assign caller to selected function clone
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
+  bool Changed = false;
+
+  // Keep track of the assignment of nodes (callsites) to function clones they
+  // call.
+  DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
+
+  // Update caller node to call function version CalleeFunc, by recording the
+  // assignment in CallsiteToCalleeFuncCloneMap.
+  auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
+                                        const FuncInfo &CalleeFunc) {
+    assert(Caller->hasCall());
+    CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
+  };
+
+  // Walk all functions for which we saw calls with memprof metadata, and handle
+  // cloning for each of their calls.
+  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
+    FuncInfo OrigFunc(Func);
+    // Map from each clone of OrigFunc to a map of remappings of each call of
+    // interest (from original uncloned call to the corresponding cloned call in
+    // that function clone).
+    std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+    for (auto &Call : CallsWithMetadata) {
+      ContextNode *Node = getNodeForInst(Call);
+      // Skip call if we do not have a node for it (all uses of its stack ids
+      // were either on inlined chains or pruned from the MIBs), or if we did
+      // not create any clones for it.
+      if (!Node || Node->Clones.empty())
+        continue;
+      assert(Node->hasCall() &&
+             "Not having a call should have prevented cloning");
+
+      // Track the assignment of function clones to clones of the current
+      // callsite Node being handled.
+      std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
+
+      // Assign callsite version CallsiteClone to function version FuncClone,
+      // and also assign (possibly cloned) Call to CallsiteClone.
+      auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
+                                                CallInfo &Call,
+                                                ContextNode *CallsiteClone,
+                                                bool IsAlloc) {
+        // Record the clone of callsite node assigned to this function clone.
+        FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
+
+        assert(FuncClonesToCallMap.count(FuncClone));
+        std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+        CallInfo CallClone(Call);
+        if (CallMap.count(Call))
+          CallClone = CallMap[Call];
+        CallsiteClone->setCall(CallClone);
+      };
+
+      // Keep track of the clones of callsite Node that need to be assigned to
+      // function clones. This list may be expanded in the loop body below if we
+      // find additional cloning is required.
+      std::deque<ContextNode *> ClonesWorklist;
+      // Ignore original Node if we moved all of its contexts to clones.
+      if (!Node->ContextIds.empty())
+        ClonesWorklist.push_back(Node);
+      ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
+                            Node->Clones.end());
+
+      // Now walk through all of the clones of this callsite Node that we need,
+      // and determine the assignment to a corresponding clone of the current
+      // function (creating new function clones as needed).
+      unsigned NodeCloneCount = 0;
+      while (!ClonesWorklist.empty()) {
+        ContextNode *Clone = ClonesWorklist.front();
+        ClonesWorklist.pop_front();
+        NodeCloneCount++;
+        if (VerifyNodes)
+          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+
+        // Need to create a new function clone if we have more callsite clones
+        // than existing function clones, which would have been assigned to an
+        // earlier clone in the list (we assign callsite clones to function
+        // clones greedily).
+        if (FuncClonesToCallMap.size() < NodeCloneCount) {
+          // If this is the first callsite copy, assign to original function.
+          if (NodeCloneCount == 1) {
+            // Since FuncClonesToCallMap is empty in this case, no clones have
+            // been created for this function yet, and no callers should have
+            // been assigned a function clone for this callee node yet.
+            assert(llvm::none_of(
+                Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+                  return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+                }));
+            // Initialize with empty call map, assign Clone to original function
+            // and its callers, and skip to the next clone.
+            FuncClonesToCallMap[OrigFunc] = {};
+            AssignCallsiteCloneToFuncClone(
+                OrigFunc, Call, Clone,
+                AllocationCallToContextNodeMap.count(Call));
+            for (auto &CE : Clone->CallerEdges) {
+              // Ignore any caller that does not have a recorded callsite Call.
+              if (!CE->Caller->hasCall())
+                continue;
+              RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
+            }
+            continue;
+          }
+
+          // First locate which copy of OrigFunc to clone again. If a caller
+          // of this callsite clone was already assigned to call a particular
+          // function clone, we need to redirect all of those callers to the
+          // new function clone, and update their other callees within this
+          // function.
+          FuncInfo PreviousAssignedFuncClone;
+          auto EI = llvm::find_if(
+              Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+                return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+              });
+          bool CallerAssignedToCloneOfFunc = false;
+          if (EI != Clone->CallerEdges.end()) {
+            const std::shared_ptr<ContextEdge> &Edge = *EI;
+            PreviousAssignedFuncClone =
+                CallsiteToCalleeFuncCloneMap[Edge->Caller];
+            CallerAssignedToCloneOfFunc = true;
+          }
+
+          // Clone function and save it along with the CallInfo map created
+          // during cloning in the FuncClonesToCallMap.
+          std::map<CallInfo, CallInfo> NewCallMap;
+          unsigned CloneNo = FuncClonesToCallMap.size();
+          assert(CloneNo > 0 && "Clone 0 is the original function, which "
+                                "should already exist in the map");
+          FuncInfo NewFuncClone = cloneFunctionForCallsite(
+              OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
+          FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+          FunctionClonesAnalysis++;
+          Changed = true;
+
+          // If no caller callsites were already assigned to a clone of this
+          // function, we can simply assign this clone to the new func clone
+          // and update all callers to it, then skip to the next clone.
+          if (!CallerAssignedToCloneOfFunc) {
+            AssignCallsiteCloneToFuncClone(
+                NewFuncClone, Call, Clone,
+                AllocationCallToContextNodeMap.count(Call));
+            for (auto &CE : Clone->CallerEdges) {
+              // Ignore any caller that does not have a recorded callsite Call.
+              if (!CE->Caller->hasCall())
+                continue;
+              RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+            }
+            continue;
+          }
+
+          // We may need to do additional node cloning in this case.
+          // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
+          // that were previously assigned to call PreviousAssignedFuncClone,
+          // to record that they now call NewFuncClone.
+          for (auto CE : Clone->CallerEdges) {
+            // Ignore any caller that does not have a recorded callsite Call.
+            if (!CE->Caller->hasCall())
+              continue;
+
+            if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
+                // We subsequently fall through to later handling that
+                // will perform any additional cloning required for
+                // callers that were calling other function clones.
+                CallsiteToCalleeFuncCloneMap[CE->Caller] !=
+                    PreviousAssignedFuncClone)
+              continue;
+
+            RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+
+            // If we are cloning a function that was already assigned to some
+            // callers, then essentially we are creating new callsite clones
+            // of the other callsites in that function that are reached by those
+            // callers. Clone the other callees of the current callsite's caller
+            // that were already assigned to PreviousAssignedFuncClone
+            // accordingly. This is important since we subsequently update the
+            // calls from the nodes in the graph and their assignments to callee
+            // functions recorded in CallsiteToCalleeFuncCloneMap.
+            for (auto CalleeEdge : CE->Caller->CalleeEdges) {
+              // Skip any that have been removed on an earlier iteration when
+              // cleaning up newly None type callee edges.
+              if (!CalleeEdge)
+                continue;
+              ContextNode *Callee = CalleeEdge->Callee;
+              // Skip the current callsite, we are looking for other
+              // callsites Caller calls, as well as any that does not have a
+              // recorded callsite Call.
+              if (Callee == Clone || !Callee->hasCall())
+                continue;
+              ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
+              removeNoneTypeCalleeEdges(NewClone);
+              // Moving the edge may have resulted in some none type
+              // callee edges on the original Callee.
+              removeNoneTypeCalleeEdges(Callee);
+              assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+              // If the Callee node was already assigned to call a specific
+              // function version, make sure its new clone is assigned to call
+              // that same function clone.
+              if (CallsiteToCalleeFuncCloneMap.count(Callee))
+                RecordCalleeFuncOfCallsite(
+                    NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
+              // Update NewClone with the new Call clone of this callsite's Call
+              // created for the new function clone created earlier.
+              // Recall that we have already ensured when building the graph
+              // that each caller can only call callsites within the same
+              // function, so we are guaranteed that Callee Call is in the
+              // current OrigFunc.
+              // CallMap is set up as indexed by original Call at clone 0.
+              CallInfo OrigCall(Callee->getOrigNode()->Call);
+              OrigCall.setCloneNo(0);
+              std::map<CallInfo, CallInfo> &CallMap =
+                  FuncClonesToCallMap[NewFuncClone];
+              assert(CallMap.count(OrigCall));
+              CallInfo NewCall(CallMap[OrigCall]);
+              assert(NewCall);
+              NewClone->setCall(NewCall);
+            }
+          }
+          // Fall through to handling below to perform the recording of the
+          // function for this callsite clone. This enables handling of cases
+          // where the callers were assigned to different clones of a function.
+        }
+
+        // See if we can use existing function clone. Walk through
+        // all caller edges to see if any have already been assigned to
+        // a clone of this callsite's function. If we can use it, do so. If not,
+        // because that function clone is already assigned to a different clone
+        // of this callsite, then we need to clone again.
+        // Basically, this checking is needed to handle the case where different
+        // caller functions/callsites may need versions of this function
+        // containing different mixes of callsite clones across the different
+        // callsites within the function. If that happens, we need to create
+        // additional function clones to handle the various combinations.
+        //
+        // Keep track of any new clones of this callsite created by the
+        // following loop, as well as any existing clone that we decided to
+        // assign this clone to.
+        std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
+        FuncInfo FuncCloneAssignedToCurCallsiteClone;
+        // We need to be able to remove Edge from CallerEdges, so need to adjust
+        // the iterator in the loop.
+        for (auto EI = Clone->CallerEdges.begin();
+             EI != Clone->CallerEdges.end();) {
+          auto Edge = *EI;
+          // Ignore any caller that does not have a recorded callsite Call.
+          if (!Edge->Caller->hasCall()) {
+            EI++;
+            continue;
+          }
+          // If this caller already assigned to call a version of OrigFunc, need
+          // to ensure we can assign this callsite clone to that function clone.
+          if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
+            FuncInfo FuncCloneCalledByCaller =
+                CallsiteToCalleeFuncCloneMap[Edge->Caller];
+            // First we need to confirm that this function clone is available
+            // for use by this callsite node clone.
+            //
+            // While FuncCloneToCurNodeCloneMap is built only for this Node and
+            // its callsite clones, one of those callsite clones X could have
+            // been assigned to the same function clone called by Edge's caller
+            // - if Edge's caller calls another callsite within Node's original
+            // function, and that callsite has another caller reaching clone X.
+            // We need to clone Node again in this case.
+            if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
+                 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
+                     Clone) ||
+                // Detect when we have multiple callers of this callsite that
+                // have already been assigned to specific, and different, clones
+                // of OrigFunc (due to other unrelated callsites in Func they
+                // reach via call contexts). Is this Clone of callsite Node
+                // assigned to a different clone of OrigFunc? If so, clone Node
+                // again.
+                (FuncCloneAssignedToCurCallsiteClone &&
+                 FuncCloneAssignedToCurCallsiteClone !=
+                     FuncCloneCalledByCaller)) {
+              // We need to use a different newly created callsite clone, in
+              // order to assign it to another new function clone on a
+              // subsequent iteration over the Clones array (adjusted below).
+              // Note we specifically do not reset the
+              // CallsiteToCalleeFuncCloneMap entry for this caller, so that
+              // when this new clone is processed later we know which version of
+              // the function to copy (so that other callsite clones we have
+              // assigned to that function clone are properly cloned over). See
+              // comments in the function cloning handling earlier.
+
+              // Check if we already have cloned this callsite again while
+              // walking through caller edges, for a caller calling the same
+              // function clone. If so, we can move this edge to that new clone
+              // rather than creating yet another new clone.
+              if (FuncCloneToNewCallsiteCloneMap.count(
+                      FuncCloneCalledByCaller)) {
+                ContextNode *NewClone =
+                    FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
+                moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
+                // Cleanup any none type edges cloned over.
+                removeNoneTypeCalleeEdges(NewClone);
+              } else {
+                // Create a new callsite clone.
+                ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
+                removeNoneTypeCalleeEdges(NewClone);
+                FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
+                    NewClone;
+                // Add to list of clones and process later.
+                ClonesWorklist.push_back(NewClone);
+                assert(EI == Clone->CallerEdges.end() ||
+                       Clone->AllocTypes != (uint8_t)AllocationType::None);
+                assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+              }
+              // Moving the caller edge may have resulted in some none type
+              // callee edges.
+              removeNoneTypeCalleeEdges(Clone);
+              // We will handle the newly created callsite clone in a subsequent
+              // iteration over this Node's Clones. Continue here since we
+              // already adjusted iterator EI while moving the edge.
+              continue;
+            }
+
+            // Otherwise, we can use the function clone already assigned to this
+            // caller.
+            if (!FuncCloneAssignedToCurCallsiteClone) {
+              FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
+              // Assign Clone to FuncCloneCalledByCaller
+              AssignCallsiteCloneToFuncClone(
+                  FuncCloneCalledByCaller, Call, Clone,
+                  AllocationCallToContextNodeMap.count(Call));
+            } else
+              // Don't need to do anything - callsite is already calling this
+              // function clone.
+              assert(FuncCloneAssignedToCurCallsiteClone ==
+                     FuncCloneCalledByCaller);
+
+          } else {
+            // We have not already assigned this caller to a version of
+            // OrigFunc. Do the assignment now.
+
+            // First check if we have already assigned this callsite clone to a
+            // clone of OrigFunc for another caller during this iteration over
+            // its caller edges.
+            if (!FuncCloneAssignedToCurCallsiteClone) {
+              // Find first function in FuncClonesToCallMap without an assigned
+              // clone of this callsite Node. We should always have one
+              // available at this point due to the earlier cloning when the
+              // FuncClonesToCallMap size was smaller than the clone number.
+              for (auto &CF : FuncClonesToCallMap) {
+                if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
+                  FuncCloneAssignedToCurCallsiteClone = CF.first;
+                  break;
+                }
+              }
+              assert(FuncCloneAssignedToCurCallsiteClone);
+              // Assign Clone to FuncCloneAssignedToCurCallsiteClone
+              AssignCallsiteCloneToFuncClone(
+                  FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+                  AllocationCallToContextNodeMap.count(Call));
+            } else
+              assert(FuncCloneToCurNodeCloneMap
+                         [FuncCloneAssignedToCurCallsiteClone] == Clone);
+            // Update callers to record function version called.
+            RecordCalleeFuncOfCallsite(Edge->Caller,
+                                       FuncCloneAssignedToCurCallsiteClone);
+          }
+
+          EI++;
+        }
+      }
+      if (VerifyCCG) {
+        checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+        for (const auto &PE : Node->CalleeEdges)
+          checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+        for (const auto &CE : Node->CallerEdges)
+          checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+        for (auto *Clone : Node->Clones) {
+          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+          for (const auto &PE : Clone->CalleeEdges)
+            checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+          for (const auto &CE : Clone->CallerEdges)
+            checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+        }
+      }
+    }
+  }
+
+  auto UpdateCalls = [&](ContextNode *Node,
+                         DenseSet<const ContextNode *> &Visited,
+                         auto &&UpdateCalls) {
+    auto Inserted = Visited.insert(Node);
+    if (!Inserted.second)
+      return;
+
+    for (auto *Clone : Node->Clones)
+      UpdateCalls(Clone, Visited, UpdateCalls);
+
+    for (auto &Edge : Node->CallerEdges)
+      UpdateCalls(Edge->Caller, Visited, UpdateCalls);
+
+    // Skip if either no call to update, or if we ended up with no context ids
+    // (we moved all edges onto other clones).
+    if (!Node->hasCall() || Node->ContextIds.empty())
+      return;
+
+    if (Node->IsAllocation) {
+      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      return;
+    }
+
+    if (!CallsiteToCalleeFuncCloneMap.count(Node))
+      return;
+
+    auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
+    updateCall(Node->Call, CalleeFunc);
+  };
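
(Editorial aside: UpdateCalls recurses by receiving itself as a trailing generic parameter, the usual workaround for a lambda being unable to name itself. Minimal standalone form of the same trick:)

    #include <cstdio>

    int main() {
      auto Fact = [](unsigned N, auto &&Self) -> unsigned {
        return N <= 1 ? 1 : N * Self(N - 1, Self);
      };
      std::printf("%u\n", Fact(5, Fact)); // prints 120
    }
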
+
+  // Sort the allocation nodes based on the OrigStackOrAllocId, which increases
+  // in insertion order, so that the following loop is deterministic (since the
+  // AllocationCallToContextNodeMap is keyed by a pointer). Specifically this
+  // can affect the order of the remarks emitted for regular LTO IR updates
+  // during the call updating.
+  std::vector<ContextNode *> AllocationNodes;
+  AllocationNodes.reserve(AllocationCallToContextNodeMap.size());
+  for (auto &Entry : AllocationCallToContextNodeMap)
+    AllocationNodes.push_back(Entry.second);
+  std::sort(AllocationNodes.begin(), AllocationNodes.end(),
+            [](const ContextNode *A, const ContextNode *B) {
+              return A->OrigStackOrAllocId < B->OrigStackOrAllocId;
+            });
+
+  // Performs DFS traversal starting from the allocation nodes to update calls
+  // to reflect the cloning decisions recorded earlier. For regular LTO this
+  // will update the actual calls in the IR to call the appropriate function
+  // clone (and add attributes to allocation calls), whereas for ThinLTO the
+  // decisions are recorded in the summary entries.
+  DenseSet<const ContextNode *> Visited;
+  for (auto *AllocNode : AllocationNodes)
+    UpdateCalls(AllocNode, Visited, UpdateCalls);
+
+  return Changed;
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
@@ -2149,13 +2803,24 @@
   if (ExportToDot)
     exportToDot("cloned");
 
-  return false;
+  bool Changed = assignFunctions();
+
+  if (DumpCCG) {
+    dbgs() << "CCG after assigning function clones:\n";
+    dbgs() << *this;
+  }
+  if (ExportToDot)
+    exportToDot("clonefuncassign");
+
+  return Changed;
 }
 
-bool MemProfContextDisambiguation::processModule(Module &M) {
+bool MemProfContextDisambiguation::processModule(
+    Module &M,
+    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
   bool Changed = false;
-  ModuleCallsiteContextGraph CCG(M);
+  ModuleCallsiteContextGraph CCG(M, OREGetter);
   Changed = CCG.process();
 
   return Changed;
@@ -2163,7 +2828,11 @@
 
 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                     ModuleAnalysisManager &AM) {
-  if (!processModule(M))
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  };
+  if (!processModule(M, OREGetter))
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -39,13 +39,35 @@
 ; RUN:   -r=%t.o,_Znam, \
 ; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_ZdaPv, \
+; RUN:   -r=%t.o,sleep, \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:   -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
+
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should have cloned bar, baz, and foo, for the cold memory allocation.
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
 source_filename = "memprof-basic.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -227,6 +249,11 @@
 ; DUMP: Clone of [[BAR]]
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: label="postbuild";
 ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
@@ -258,3 +285,9 @@
 ; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"];
 ; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
 ; DOTCLONED: }
+
+
+; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
+; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
+; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1)
diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
--- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -1,7 +1,8 @@
 ;; Test callsite context graph generation for call graph with MIBs
 ;; that have pruned contexts that partially match multiple inlined
 ;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -60,7 +61,9 @@
 ; RUN:   -r=%t.o,_Znam, \
 ; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
 
 ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -68,6 +71,27 @@
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_ZdaPv, \
+; RUN:   -r=%t.o,sleep, \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:   -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
+
+; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+;; We should clone D once for the cold allocations via C.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
 source_filename = "duplicate-context-ids.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -104,7 +128,13 @@
   ret ptr null
 }
 
-declare i32 @main()
+define i32 @main() {
+entry:
+  call ptr @_Z1Bv()
+  call ptr @_Z1Ev()
+  call ptr @_Z1Fv()
+  ret i32 0
+}
 
 declare void @_ZdaPv()
 
@@ -268,6 +298,11 @@
 ; DUMP: Clone of [[D]]
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOTPRE: digraph "prestackupdate" {
 ; DOTPRE: label="prestackupdate";
 ; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
@@ -305,3 +340,9 @@
 ; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"];
 ; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
 ; DOTCLONED: }
+
+; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1)
+; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
+; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0)
+; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
@@ -0,0 +1,232 @@
+;; Test context disambiguation for a callgraph containing multiple memprof
+;; contexts and no inlining, where we need to perform additional cloning
+;; during function assignment/cloning to handle the combination of contexts
+;; to 2 different allocations.
+;;
+;; void E(char **buf1, char **buf2) {
+;;   *buf1 = new char[10];
+;;   *buf2 = new char[10];
+;; }
+;;
+;; void B(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void C(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void D(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *cold1, *cold2, *default1, *default2, *default3, *default4;
+;;   B(&default1, &default2);
+;;   C(&default3, &cold1);
+;;   D(&cold2, &default4);
+;;   memset(cold1, 0, 10);
+;;   memset(cold2, 0, 10);
+;;   memset(default1, 0, 10);
+;;   memset(default2, 0, 10);
+;;   memset(default3, 0, 10);
+;;   memset(default4, 0, 10);
+;;   delete[] default1;
+;;   delete[] default2;
+;;   delete[] default3;
+;;   delete[] default4;
+;;   sleep(10);
+;;   delete[] cold1;
+;;   delete[] cold2;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_ZdaPv, \
+; RUN:   -r=%t.o,sleep, \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_ZdaPv, \
+; RUN:   -r=%t.o,sleep, \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:   -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
+
+
+source_filename = "funcassigncloning.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline optnone
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
+entry:
+  %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
+  %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
+  ret void
+}
+
+declare ptr @_Znam(i64)
+
+define internal void @_Z1BPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
+  ret void
+}
+
+define internal void @_Z1CPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
+  ret void
+}
+
+; Function Attrs: noinline optnone
+define i32 @main() {
+entry:
+  call void @_Z1BPPcS0_()
+  call void @_Z1CPPcS0_()
+  call void @_Z1DPPcS0_()
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+
+
+;; Originally we create a single clone of each call to new from E, since each
+;; allocates cold memory for a single caller.
+
+; DUMP: CCG after cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 2 3
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
+
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 6
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: CallerEdges:
+
+; DUMP: Node [[C]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 2 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 3 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[ENEW2ORIG]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 4 6
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: Clones: [[ENEW2CLONE]]
+
+; DUMP: Node [[ENEW1CLONE]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Clone of [[ENEW1ORIG]]
+
+; DUMP: Node [[ENEW2CLONE]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 5
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: Clone of [[ENEW2ORIG]]
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
--- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -1,7 +1,7 @@
 ;; Tests callsite context graph generation for call graph containing indirect
 ;; calls. Currently this should result in conservative behavior, such that the
 ;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -61,7 +61,9 @@
 ; RUN:   -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
 ; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should only create a single clone of foo, for the direct call
@@ -69,6 +71,26 @@
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_ZdaPv, \
+; RUN:   -r=%t.o,sleep, \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
+; RUN:   -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:   -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should only create a single clone of foo, for the direct call
+;; from main allocating cold memory.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
 source_filename = "indirectcall.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -359,6 +381,11 @@
 ; DUMP: Clone of [[FOO]]
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: label="postbuild";
 ; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];
diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
--- a/llvm/test/ThinLTO/X86/memprof-inlined.ll
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -1,6 +1,7 @@
 ;; Test callsite context graph generation for call graph with two memprof
 ;; contexts and partial inlining, requiring generation of a new fused node to
 ;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -48,7 +49,9 @@
 ; RUN:   -r=%t.o,_Znam, \
 ; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:   -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should create clones for foo and bar for the call from main to allocate
@@ -56,6 +59,24 @@
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_ZdaPv, \
+; RUN:   -r=%t.o,sleep, \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:   -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:   --check-prefix=STATS
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should create clones for foo and bar for the call from main to allocate
+;; cold memory.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
 source_filename = "inlined.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -257,6 +278,11 @@
 ; DUMP: Clone of [[BAR]]
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: label="postbuild";
 ; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -1,5 +1,5 @@
 ;; Test callsite context graph generation for simple call graph with
-;; two memprof contexts and no inlining.
+;; two memprof contexts and no inlining, as well as graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -34,7 +34,9 @@
 ; RUN: opt -passes=memprof-context-disambiguation \
 ; RUN:   -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:   -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:   %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:   -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:   %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN:   --check-prefix=STATS --check-prefix=REMARKS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -222,6 +224,48 @@
 ; DUMP: Clone of [[BAR]]
 
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3bazv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
+; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv
+; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+
+
+; IR: define {{.*}} @main
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR: call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv()
+; IR: call {{.*}} @_Z3barv()
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Z3bazv()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv.memprof.1()
+; IR: call {{.*}} @_Z3barv.memprof.1()
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3bazv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: label="postbuild";
 ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];
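
(Editorial aside: the IR checks above show the end state for regular LTO, where each allocation call carries a "memprof" function attribute naming its, possibly cloned, allocation type. A sketch of how later code could query that attribute; this assumes LLVM dev headers, and the helper is illustrative rather than part of this patch:)

    #include "llvm/IR/InstrTypes.h"

    using namespace llvm;

    static bool isColdAllocationCall(const CallBase &CB) {
      Attribute A = CB.getFnAttr("memprof");
      return A.isValid() && A.getValueAsString() == "cold";
    }
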
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -263,6 +266,39 @@
; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
; DUMP: Clone of [[D]]
+; REMARKS: created clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+
+;; The allocation via F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR: call {{.*}} @_Z1Dv()
+;; The allocations via B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with a
+;; "cold" attribute.
+; IR: define internal {{.*}} @_Z1Bv()
+; IR: call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Ev()
+; IR: call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Dv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
; DOTPRE: digraph "prestackupdate" {
; DOTPRE: label="prestackupdate";
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
@@ -0,0 +1,244 @@
+;; Test context disambiguation for a callgraph containing multiple memprof
+;; contexts and no inlining, where we need to perform additional cloning
+;; during function assignment/cloning to handle the combinations of contexts
+;; reaching 2 different allocations.
+;;
+;; void E(char **buf1, char **buf2) {
+;; *buf1 = new char[10];
+;; *buf2 = new char[10];
+;; }
+;;
+;; void B(char **buf1, char **buf2) {
+;; E(buf1, buf2);
+;; }
+;;
+;; void C(char **buf1, char **buf2) {
+;; E(buf1, buf2);
+;; }
+;;
+;; void D(char **buf1, char **buf2) {
+;; E(buf1, buf2);
+;; }
+;; int main(int argc, char **argv) {
+;; char *cold1, *cold2, *default1, *default2, *default3, *default4;
+;; B(&default1, &default2);
+;; C(&default3, &cold1);
+;; D(&cold2, &default4);
+;; memset(cold1, 0, 10);
+;; memset(cold2, 0, 10);
+;; memset(default1, 0, 10);
+;; memset(default2, 0, 10);
+;; memset(default3, 0, 10);
+;; memset(default4, 0, 10);
+;; delete[] default1;
+;; delete[] default2;
+;; delete[] default3;
+;; delete[] default4;
+;; sleep(10);
+;; delete[] cold1;
+;; delete[] cold2;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
+entry:
+ %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7
+ %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15
+ ret void
+}
+
+declare ptr @_Znam(i64) #1
+
+define internal void @_Z1BPPcS0_(ptr %0, ptr %1) {
+entry:
+ call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16
+ ret void
+}
+
+; Function Attrs: noinline
+define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 {
+entry:
+ call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17
+ ret void
+}
+
+define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 {
+entry:
+ call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18
+ ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4
+
+declare i32 @sleep() #5
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
+attributes #1 = { "no-trapping-math"="true" }
+attributes #2 = { noinline }
+attributes #3 = { "frame-pointer"="all" }
+attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #5 = { "disable-tail-calls"="true" }
+attributes #6 = { builtin }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+
+
+;; Originally we create a single clone of each call to new from E, since each
+;; allocates cold memory for a single caller.
+
+; DUMP: CCG after cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
+; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 2 3
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
+
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 6
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: CallerEdges:
+
+; DUMP: Node [[C]]
+; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 2 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 3 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[ENEW2ORIG]]
+; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 4 6
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: Clones: [[ENEW2CLONE]]
+
+; DUMP: Node [[ENEW1CLONE]]
+; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Clone of [[ENEW1ORIG]]
+
+; DUMP: Node [[ENEW2CLONE]]
+; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 5
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: Clone of [[ENEW2ORIG]]
+
+
+;; We greedily create a clone of E that is initially used by the clones of the
+;; first call to new. However, we end up with an incompatible set of callers
+;; given the second call to new, which has clones with a different combination of
+;; callers. Eventually, we create 2 more clones, and the first clone becomes dead.
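To make the incompatible caller sets concrete: from the !memprof metadata above, B needs (notcold, notcold) from E's two allocation sites, C needs (notcold, cold), and D needs (cold, notcold). Each distinct pair requires its own copy of E, while a clone created for one allocation in isolation matches no caller's full signature. A small standalone C++ sketch of this grouping (illustrative only; the caller table is transcribed from the metadata above, and the logic is schematic rather than the pass's actual algorithm):

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      // Allocation type each caller of E requires from its two calls to new,
      // taken from the memprof contexts in this test.
      using Sig = std::pair<std::string, std::string>;
      std::map<std::string, Sig> Required = {{"B", {"notcold", "notcold"}},
                                             {"C", {"notcold", "cold"}},
                                             {"D", {"cold", "notcold"}}};
      // Group callers by signature: every distinct signature needs its own
      // copy of E (here, the original plus two live clones).
      std::map<Sig, std::vector<std::string>> Copies;
      for (const auto &[Caller, S] : Required)
        Copies[S].push_back(Caller);
      for (const auto &[S, Callers] : Copies) {
        std::cout << "(" << S.first << ", " << S.second << "):";
        for (const auto &C : Callers)
          std::cout << " " << C;
        std::cout << "\n";
      }
    }

Grouping by the full signature yields three copies, matching the REMARKS and IR checks below; the extra greedily created clone (_Z1EPPcS0_.memprof.1) ends up serving no caller and keeps plain builtin allocation calls.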
+; REMARKS: created clone _Z1EPPcS0_.memprof.1
+; REMARKS: created clone _Z1EPPcS0_.memprof.2
+; REMARKS: created clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+
+
+;; Original version of E is used for the non-cold allocations, both from B.
+; IR: define internal {{.*}} @_Z1EPPcS0_(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1BPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_(
+;; C calls a clone of E with the first new allocating non-cold memory and the
+;; second allocating cold memory.
+; IR: define internal {{.*}} @_Z1CPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.memprof.3(
+;; D calls a clone of E with the first new allocating cold memory and the
+;; second allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.memprof.2(
+;; Transient clone that will get removed as it ends up with no callers.
+;; Its calls to new never get updated with a memprof attribute as a result.
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.1(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[DEFAULT]] = { builtin }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
--- a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
@@ -1,7 +1,7 @@
;; Tests callsite context graph generation for call graph containing indirect
;; calls. Currently this should result in conservative behavior, such that the
;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -54,7 +54,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
@@ -340,6 +342,41 @@
; DUMP: Clone of [[FOO]]
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
+
+
+; IR: define {{.*}} @main(
+; IR: call {{.*}} @_Z3foov()
+;; Only the second call to foo, which allocates cold memory via direct calls,
+;; is replaced with a call to a clone that calls a cold allocation.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: define internal {{.*}} @_ZN1A1xEv(
+; IR: call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_ZN1B1xEv(
+; IR: call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"];
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
--- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
@@ -1,6 +1,7 @@
;; Test callsite context graph generation for call graph with two memprof
;; contexts and partial inlining, requiring generation of a new fused node to
;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -43,7 +44,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
@@ -251,6 +254,42 @@
; DUMP: Clone of [[BAR]]
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold
+
+
+; IR: define internal {{.*}} @_Z3barv()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Z3barv()
+; IR: define {{.*}} @main()
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR: call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions,
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3barv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];