diff --git a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h --- a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h +++ b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h @@ -99,6 +99,10 @@ ImmutablePass * createImmutableModuleSummaryIndexWrapperPass(const ModuleSummaryIndex *Index); +/// Returns true if the instruction could have memprof metadata, used to ensure +/// consistency between summary analysis and the ThinLTO backend processing. +bool mayHaveMemprofSummary(const CallBase *CB); + } // end namespace llvm #endif // LLVM_ANALYSIS_MODULESUMMARYANALYSIS_H diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1559,6 +1559,11 @@ Attrs = Attrs.removeFnAttribute(getContext(), Kind); } + /// Removes the attribute from the function + void removeFnAttr(StringRef Kind) { + Attrs = Attrs.removeFnAttribute(getContext(), Kind); + } + /// Removes the attribute from the return value void removeRetAttr(Attribute::AttrKind Kind) { Attrs = Attrs.removeRetAttribute(getContext(), Kind); diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1305,6 +1305,10 @@ /// Indicates that summary-based synthetic entry count propagation has run bool HasSyntheticEntryCounts = false; + /// Indicates that summary-based profile guided heap optimization context + /// disambiguation has run. + bool WithMemProfContextDisambiguation = false; + /// Indicates that distributed backend should skip compilation of the /// module.
Flag is suppose to be set by distributed ThinLTO indexing /// when it detected that the module is not needed during the final @@ -1513,6 +1517,13 @@ bool hasSyntheticEntryCounts() const { return HasSyntheticEntryCounts; } void setHasSyntheticEntryCounts() { HasSyntheticEntryCounts = true; } + bool withMemProfContextDisambiguation() const { + return WithMemProfContextDisambiguation; + } + void setWithMemProfContextDisambiguation() { + WithMemProfContextDisambiguation = true; + } + bool skipModuleByDistributedBackend() const { return SkipModuleByDistributedBackend; } diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h --- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h +++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h @@ -18,21 +18,36 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringSet.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/PassManager.h" #include namespace llvm { class GlobalValueSummary; class Module; -class ModuleSummaryIndex; +class OptimizationRemarkEmitter; class MemProfContextDisambiguation : public PassInfoMixin { - /// Run the context disambiguator on \p M, returns true if any changes made. - bool processModule(Module &M); + /// Run the context disambiguator on \p M, returns true if any changes + /// were made. + bool processModule( + Module &M, + function_ref OREGetter); + + /// In the ThinLTO backend, apply the cloning decisions in ImportSummary to + /// the IR. + bool applyImport(Module &M); + + /// Import summary containing cloning decisions for the ThinLTO backend. + const ModuleSummaryIndex *ImportSummary; + + // Owns the import summary specified by internal options for testing the + // ThinLTO backend via opt (to simulate distributed ThinLTO).
+ std::unique_ptr ImportSummaryForTesting; public: - MemProfContextDisambiguation() {} + MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -284,6 +284,10 @@ std::vector Callsites; std::vector Allocs; +#ifndef NDEBUG + DenseSet CallsThatMayHaveMemprofSummary; +#endif + bool HasInlineAsmMaybeReferencingInternal = false; bool HasIndirBranchToBlockAddress = false; bool HasUnknownCall = false; @@ -427,6 +431,10 @@ .updateHotness(getHotness(Candidate.Count, PSI)); } + // Summarize memprof related metadata. This is only needed for ThinLTO. + if (!IsThinLTO) + continue; + // TODO: Skip indirect calls for now. Need to handle these better, likely // by creating multiple Callsites, one per target, then speculatively // devirtualize while applying clone info in the ThinLTO backends. This @@ -437,6 +445,14 @@ if (!CalledFunction) continue; + // Ensure we keep this analysis in sync with the handling in the ThinLTO + // backend (see MemProfContextDisambiguation::applyImport). Save this call + // so that we can skip it in checking the reverse case later. + assert(mayHaveMemprofSummary(CB)); +#ifndef NDEBUG + CallsThatMayHaveMemprofSummary.insert(CB); +#endif + // Compute the list of stack ids first (so we can trim them from the stack // ids on any MIBs). CallStack InstCallsite( @@ -546,6 +562,25 @@ ? CalleeInfo::HotnessType::Cold : CalleeInfo::HotnessType::Critical); +#ifndef NDEBUG + // Make sure that all calls we decided could not have memprof summaries get a + // false value for mayHaveMemprofSummary, to ensure that this handling remains + // in sync with the ThinLTO backend handling. 
+ if (IsThinLTO) { + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + const auto *CB = dyn_cast(&I); + if (!CB) + continue; + // We already checked these above. + if (CallsThatMayHaveMemprofSummary.count(CB)) + continue; + assert(!mayHaveMemprofSummary(CB)); + } + } + } +#endif + bool NonRenamableLocal = isNonRenamableLocal(F); bool NotEligibleForImport = NonRenamableLocal || HasInlineAsmMaybeReferencingInternal || @@ -1042,3 +1077,36 @@ INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info", "Module summary info", false, true) + +bool llvm::mayHaveMemprofSummary(const CallBase *CB) { + if (!CB) + return false; + if (CB->isDebugOrPseudoInst()) + return false; + auto *CI = dyn_cast(CB); + auto *CalledValue = CB->getCalledOperand(); + auto *CalledFunction = CB->getCalledFunction(); + if (CalledValue && !CalledFunction) { + CalledValue = CalledValue->stripPointerCasts(); + // Stripping pointer casts can reveal a called function. + CalledFunction = dyn_cast(CalledValue); + } + // Check if this is an alias to a function. If so, get the + // called aliasee for the checks below. + if (auto *GA = dyn_cast(CalledValue)) { + assert(!CalledFunction && + "Expected null called function in callsite for alias"); + CalledFunction = dyn_cast(GA->getAliaseeObject()); + } + // Check if this is a direct call to a known function or a known + // intrinsic, or an indirect call with profile data. + if (CalledFunction) { + if (CI && CalledFunction->isIntrinsic()) + return false; + } else { + // TODO: For now skip indirect calls. See comments in + // computeFunctionSummary for what is needed to handle this. + return false; + } + return true; +} diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8067,7 +8067,7 @@ case bitc::FS_FLAGS: { // [flags] uint64_t Flags = Record[0]; // Scan flags. 
- assert(Flags <= 0xff && "Unexpected bits in flag"); + assert(Flags <= 0x1ff && "Unexpected bits in flag"); return Flags & 0x8; } diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -107,11 +107,13 @@ Flags |= 0x40; if (withWholeProgramVisibility()) Flags |= 0x80; + if (withMemProfContextDisambiguation()) + Flags |= 0x100; return Flags; } void ModuleSummaryIndex::setFlags(uint64_t Flags) { - assert(Flags <= 0xff && "Unexpected bits in flag"); + assert(Flags <= 0x1ff && "Unexpected bits in flag"); // 1 bit: WithGlobalValueDeadStripping flag. // Set on combined index only. if (Flags & 0x1) @@ -145,6 +147,10 @@ // Set on combined index only. if (Flags & 0x80) setWithWholeProgramVisibility(); + // 1 bit: WithMemProfContextDisambiguation flag. + // Set on combined index only. + if (Flags & 0x100) + setWithMemProfContextDisambiguation(); } // Collect for the given module the list of function it defines diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1529,6 +1529,11 @@ ModulePassManager MPM; if (ImportSummary) { + // For ThinLTO we must apply the context disambiguation decisions early, to + // ensure we can correctly match the callsites to summary data. + if (EnableMemProfContextDisambiguation) + MPM.addPass(MemProfContextDisambiguation(ImportSummary)); + // These passes import type identifier resolutions for whole-program // devirtualization and CFI. 
They must run early because other passes may // disturb the specific instruction patterns that these passes look for, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -81,13 +81,13 @@ MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass()) MODULE_PASS("lower-ifunc", LowerIFuncPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) +MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation()) MODULE_PASS("metarenamer", MetaRenamerPass()) MODULE_PASS("mergefunc", MergeFunctionsPass()) MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) MODULE_PASS("partial-inliner", PartialInlinerPass()) -MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation()) MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion()) MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen()) MODULE_PASS("pgo-instr-use", PGOInstrumentationUse()) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -27,8 +27,11 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -39,13 +42,42 @@ #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h" #include #include using namespace llvm; using namespace 
llvm::memprof; +using namespace ore; #define DEBUG_TYPE "memprof-context-disambiguation" +STATISTIC(FunctionClonesAnalysis, + "Number of function clones created during whole program analysis"); +STATISTIC(FunctionClonesThinBackend, + "Number of function clones created during ThinLTO backend"); +STATISTIC(FunctionsClonedThinBackend, + "Number of functions that had clones created during ThinLTO backend"); +STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly " + "cloned) during whole program analysis"); +STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) " + "during whole program analysis"); +STATISTIC(AllocTypeNotColdThinBackend, + "Number of not cold static allocations (possibly cloned) during " + "ThinLTO backend"); +STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations " + "(possibly cloned) during ThinLTO backend"); +STATISTIC(OrigAllocsThinBackend, + "Number of original (not cloned) allocations with memprof profiles " + "during ThinLTO backend"); +STATISTIC( + AllocVersionsThinBackend, + "Number of allocation versions (including clones) during ThinLTO backend"); +STATISTIC(MaxAllocVersionsThinBackend, + "Maximum number of allocation versions created for an original " + "allocation during ThinLTO backend"); +STATISTIC(UnclonableAllocsThinBackend, + "Number of unclonable ambigous allocations during ThinLTO backend"); + static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, cl::value_desc("filename"), @@ -67,6 +99,11 @@ VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes.")); +static cl::opt MemProfImportSummary( + "memprof-import-summary", + cl::desc("Import summary to use for testing the ThinLTO backend via opt"), + cl::Hidden); + /// CRTP base for graphs built from either IR or ThinLTO summary index. 
/// /// The graph represents the call contexts in all memprof metadata on allocation @@ -95,6 +132,13 @@ /// behavior of an allocation based on its context. void identifyClones(); + /// Assign callsite clones to functions, cloning functions as needed to + /// accommodate the combinations of their callsite clones reached by callers. + /// For regular LTO this clones functions and callsites in the IR, but for + /// ThinLTO the cloning decisions are noted in the summaries and later applied + /// in applyImport. + bool assignFunctions(); + void dump() const; void print(raw_ostream &OS) const; @@ -375,6 +419,28 @@ return static_cast(this)->getLastStackId(Call); } + /// Update the allocation call to record type of allocated memory. + void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { + AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; + static_cast(this)->updateAllocationCall(Call, AllocType); + } + + /// Update non-allocation call to invoke (possibly cloned) function + /// CalleeFunc. + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { + static_cast(this)->updateCall(CallerCall, CalleeFunc); + } + + /// Clone the given function for the given callsite, recording mapping of all + /// of the function's tracked calls to their new versions in the CallMap. + /// Assigns new clones to clone number CloneNo. + FuncInfo cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + return static_cast(this)->cloneFunctionForCallsite( + Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); + } + /// Gets a label to use in the dot graph for the given call clone in the given /// function.
std::string getLabel(const FuncTy *Func, const CallTy Call, @@ -469,7 +535,9 @@ : public CallsiteContextGraph { public: - ModuleCallsiteContextGraph(Module &M); + ModuleCallsiteContextGraph( + Module &M, + function_ref OREGetter); private: friend CallsiteContextGraph getStackIdsWithContextNodesForCall(Instruction *Call); + void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); + CallsiteContextGraph::FuncInfo + cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, + std::map &CallMap, + std::vector &CallsWithMetadataInFunc, + unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const; const Module &Mod; + function_ref OREGetter; }; /// Represents a call in the summary index graph, which can either be an @@ -527,6 +604,14 @@ bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); uint64_t getLastStackId(IndexCall &Call); std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); + CallsiteContextGraph::FuncInfo + cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, + std::map &CallMap, + std::vector &CallsWithMetadataInFunc, + unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, unsigned CloneNo) const; @@ -1282,10 +1367,12 @@ return Index.getStackIdAtIndex(CallsiteContext.back()); } +static const std::string MemProfCloneSuffix = ".memprof."; + static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { if (!CloneNo) return Base.str(); - return (Base + ".memprof." 
+ Twine(CloneNo)).str(); + return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); } std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, @@ -1347,7 +1434,9 @@ return StackIds; } -ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { +ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( + Module &M, function_ref OREGetter) + : Mod(M), OREGetter(OREGetter) { for (auto &F : M) { std::vector CallsWithMetadata; for (auto &BB : F) { @@ -2123,6 +2212,925 @@ checkNode(Node, /*CheckEdges=*/true); } +static std::string getAllocTypeAttributeString(AllocationType Type) { + switch (Type) { + case AllocationType::NotCold: + return "notcold"; + break; + case AllocationType::Cold: + return "cold"; + break; + default: + dbgs() << "Unexpected alloc type " << (uint8_t)Type; + assert(false); + } + llvm_unreachable("invalid alloc type"); +} + +void ModuleCallsiteContextGraph::updateAllocationCall( + CallInfo &Call, AllocationType AllocType) { + std::string AllocTypeString = getAllocTypeAttributeString(AllocType); + auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), + "memprof", AllocTypeString); + cast(Call.call())->addFnAttr(A); + OREGetter(Call.call()->getFunction()) + .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) + << NV("AllocationCall", Call.call()) << " in clone " + << NV("Caller", Call.call()->getFunction()) + << " marked with memprof allocation attribute " + << NV("Attribute", AllocTypeString)); +} + +void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, + AllocationType AllocType) { + auto *AI = Call.call().dyn_cast(); + assert(AI); + assert(AI->Versions.size() > Call.cloneNo()); + AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; +} + +void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, + FuncInfo CalleeFunc) { + if (CalleeFunc.cloneNo() > 0) + cast(CallerCall.call())->setCalledFunction(CalleeFunc.func()); + OREGetter(CallerCall.call()->getFunction()) + 
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) + << NV("Call", CallerCall.call()) << " in clone " + << NV("Caller", CallerCall.call()->getFunction()) + << " assigned to call function clone " + << NV("Callee", CalleeFunc.func())); +} + +void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, + FuncInfo CalleeFunc) { + auto *CI = CallerCall.call().dyn_cast(); + // Caller cannot be an allocation. + assert(CI); + assert(CI->Clones.size() > CallerCall.cloneNo()); + CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); +} + +CallsiteContextGraph::FuncInfo +ModuleCallsiteContextGraph::cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + // Use existing LLVM facilities for cloning and obtaining Call in clone + ValueToValueMapTy VMap; + auto *NewFunc = CloneFunction(Func.func(), VMap); + std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); + assert(!Func.func()->getParent()->getFunction(Name)); + NewFunc->setName(Name); + for (auto &Inst : CallsWithMetadataInFunc) { + // This map always has the initial version in it. + assert(Inst.cloneNo() == 0); + CallMap[Inst] = {cast(VMap[Inst.call()]), CloneNo}; + } + OREGetter(Func.func()) + .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) + << "created clone " << NV("NewFunction", NewFunc)); + return {NewFunc, CloneNo}; +} + +CallsiteContextGraph::FuncInfo +IndexCallsiteContextGraph::cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + // Check how many clones we have of Call (and therefore function). + // The next clone number is the current size of versions array. + // Confirm this matches the CloneNo provided by the caller, which is based on + // the number of function clones we have. + assert(CloneNo == + (Call.call().is() + ? 
Call.call().dyn_cast()->Versions.size() + : Call.call().dyn_cast()->Clones.size())); + // Walk all the instructions in this function. Create a new version for + // each (by adding an entry to the Versions/Clones summary array), and copy + // over the version being called for the function clone being cloned here. + // Additionally, add an entry to the CallMap for the new function clone, + // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) + // to the new call clone. + for (auto &Inst : CallsWithMetadataInFunc) { + // This map always has the initial version in it. + assert(Inst.cloneNo() == 0); + if (auto *AI = Inst.call().dyn_cast()) { + assert(AI->Versions.size() == CloneNo); + // We assign the allocation type later (in updateAllocationCall), just add + // an entry for it here. + AI->Versions.push_back(0); + } else { + auto *CI = Inst.call().dyn_cast(); + assert(CI && CI->Clones.size() == CloneNo); + // We assign the clone number later (in updateCall), just add an entry for + // it here. + CI->Clones.push_back(0); + } + CallMap[Inst] = {Inst.call(), CloneNo}; + } + return {Func.func(), CloneNo}; +} + +template +bool CallsiteContextGraph::assignFunctions() { + bool Changed = false; + + // Keep track of the assignment of nodes (callsites) to function clones they + // call. + std::map CallsiteToCalleeFuncCloneMap; + + // Update caller node to call function version CalleeFunc, by recording the + // assignment in CallsiteToCalleeFuncCloneMap. + auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, + const FuncInfo &CalleeFunc) { + assert(Caller->hasCall()); + CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; + }; + + // Walk all functions for which we saw calls with memprof metadata, and handle + // cloning for each of its calls. 
+ for (auto &FuncEntry : FuncToCallsWithMetadata) { + FuncInfo OrigFunc(FuncEntry.first); + // Map from each clone of OrigFunc to a map of remappings of each call of + // interest (from original uncloned call to the corresponding cloned call in + // that function clone). + std::map> FuncClonesToCallMap; + for (auto &Call : FuncEntry.second) { + ContextNode *Node = getNodeForInst(Call); + // Skip call if we do not have a node for it (all uses of its stack ids + // were either on inlined chains or pruned from the MIBs), or if we did + // not create any clones for it. + if (!Node || Node->Clones.empty()) + continue; + // Not having a call should have prevented cloning. + assert(Node->hasCall()); + + // Track the assignment of function clones to clones of the current + // callsite Node being handled. + std::map FuncCloneToCurNodeCloneMap; + + // Assign callsite version CallsiteClone to function version FuncClone, + // and also assign (possibly cloned) Call to CallsiteClone. + auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, + CallInfo &Call, + ContextNode *CallsiteClone, + bool IsAlloc) { + // Record the clone of callsite node assigned to this function clone. + FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; + + assert(FuncClonesToCallMap.count(FuncClone)); + std::map &CallMap = FuncClonesToCallMap[FuncClone]; + CallInfo CallClone(Call); + if (CallMap.count(Call)) + CallClone = CallMap[Call]; + CallsiteClone->setCall(CallClone); + }; + + // Keep track of the clones of callsite Node that need to be assigned to + // function clones. This list may be expanded in the loop body below if we + // find additional cloning is required. + std::vector Clones(Node->Clones); + // Ignore original Node if we moved all of its contexts to clones. 
+ if (!Node->ContextIds.empty()) + Clones.insert(Clones.begin(), Node); + + // Now walk through all of the clones of this callsite Node that we need, + // and determine the assignment to a corresponding clone of the current + // function (creating new function clones as needed). + for (unsigned I = 0; I < Clones.size(); I++) { + ContextNode *Clone = Clones[I]; + if (VerifyNodes) + checkNode(Clone, /*CheckEdges=*/true); + + // Need to create a new function clone if we have more callsite clones + // than existing function clones, which would have been assigned to an + // earlier clone in the list (we assign callsite clones to function + // clones greedily). + if (FuncClonesToCallMap.size() <= I) { + // If this is the first callsite copy, assign to original function. + if (I == 0) { + // Since FuncClonesToCallMap is empty in this case, no clones have + // been created for this function yet, and no callers should have + // been assigned a function clone for this callee node yet. + assert(llvm::none_of( + Clone->CallerEdges, [&](const std::shared_ptr &E) { + return CallsiteToCalleeFuncCloneMap.count(E->Caller); + })); + // Initialize with empty call map, assign Clone to original function + // and its callers, and skip to the next clone. + FuncClonesToCallMap[OrigFunc] = {}; + AssignCallsiteCloneToFuncClone( + OrigFunc, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + for (auto CE : Clone->CallerEdges) { + // Ignore any caller that does not have a recorded callsite Call. + if (!CE->Caller->hasCall()) + continue; + RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); + } + continue; + } + + // First locate which copy of OrigFunc to clone again. If a caller + // of this callsite clone was already assigned to call a particular + // function clone, we need to redirect all of those callers to the + // new function clone, and update their other callees within this + // function. 
+ FuncInfo PreviousAssignedFuncClone; + auto EI = llvm::find_if( + Clone->CallerEdges, [&](const std::shared_ptr &E) { + return CallsiteToCalleeFuncCloneMap.count(E->Caller); + }); + bool CallerAssignedToCloneOfFunc = false; + if (EI != Clone->CallerEdges.end()) { + const std::shared_ptr &Edge = *EI; + PreviousAssignedFuncClone = + CallsiteToCalleeFuncCloneMap[Edge->Caller]; + CallerAssignedToCloneOfFunc = true; + } + + // Clone function and save it along with the CallInfo map created + // during cloning in the FuncClonesToCallMap. + std::map NewCallMap; + unsigned CloneNo = FuncClonesToCallMap.size(); + // Clone 0 is the original function, which should already exist in the + // map. + assert(CloneNo > 0); + FuncInfo NewFuncClone = cloneFunctionForCallsite( + OrigFunc, Call, NewCallMap, FuncEntry.second, CloneNo); + FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); + FunctionClonesAnalysis++; + Changed = true; + + // If no caller callsites were already assigned to a clone of this + // function, we can simply assign this clone to the new func clone + // and update all callers to it, then skip to the next clone. + if (!CallerAssignedToCloneOfFunc) { + AssignCallsiteCloneToFuncClone( + NewFuncClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + for (auto CE : Clone->CallerEdges) { + // Ignore any caller that does not have a recorded callsite Call. + if (!CE->Caller->hasCall()) + continue; + RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); + } + continue; + } + + // We may need to do additional node cloning in this case. + // Reset the CallsiteToCalleeFuncCloneMap entry for any callers + // that were previously assigned to call PreviousAssignedFuncClone, + // to record that they now call NewFuncClone. + for (auto CE : Clone->CallerEdges) { + // Ignore any caller that does not have a recorded callsite Call. 
+ if (!CE->Caller->hasCall()) + continue; + + if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || + // We subsequently fall through to later handling that + // will perform any additional cloning required for + // callers that were calling other function clones. + CallsiteToCalleeFuncCloneMap[CE->Caller] != + PreviousAssignedFuncClone) + continue; + + RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); + + // If we are cloning a function that was already assigned to some + // callers, then essentially we are creating new callsite clones + // of the other callsites in that function that are reached by those + // callers. Clone the other callees of the current callsite's caller + // that were already assigned to PreviousAssignedFuncClone + // accordingly. This is important since we subsequently update the + // calls from the nodes in the graph and their assignments to callee + // functions recorded in CallsiteToCalleeFuncCloneMap. + for (auto CalleeEdge : CE->Caller->CalleeEdges) { + // Skip any that have been removed on an earlier iteration when + // cleaning up newly None type callee edges. + if (!CalleeEdge) + continue; + ContextNode *Callee = CalleeEdge->Callee; + // Skip the current callsite, we are looking for other + // callsites Caller calls. + if (Callee == Clone) + continue; + if (!Callee->hasCall()) + continue; + ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); + removeNoneTypeCalleeEdges(NewClone); + // Moving the edge may have resulted in some none type + // callee edges on the original Callee. + removeNoneTypeCalleeEdges(Callee); + assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); + // If the Callee node was already assigned to call a specific + // function version, make sure its new clone is assigned to call + // that same function clone. 
+ if (CallsiteToCalleeFuncCloneMap.count(Callee)) + RecordCalleeFuncOfCallsite( + NewClone, CallsiteToCalleeFuncCloneMap[Callee]); + // Update NewClone with the new Call clone of this callsite's Call + // created for the new function clone created earlier. + // Recall that we have already ensured when building the graph + // that each caller can only call callsites within the same + // function, so we are guaranteed that Callee Call is in the + // current OrigFunc. + // CallMap is set up as indexed by original Call at clone 0. + CallInfo OrigCall(Callee->getOrigNode()->Call); + OrigCall.setCloneNo(0); + std::map &CallMap = + FuncClonesToCallMap[NewFuncClone]; + assert(CallMap.count(OrigCall)); + CallInfo NewCall(CallMap[OrigCall]); + assert(NewCall); + NewClone->setCall(NewCall); + } + } + // Fall through to handling below to perform the recording of the + // function for this callsite clone. This enables handling of cases + // where the callers were assigned to different clones of a function. + } + + // See if we can use existing function clone. Walk through + // all caller edges to see if any have already been assigned to + // a clone of this callsite's function. If we can use it, do so. If not, + // because that function clone is already assigned to a different clone + // of this callsite, then we need to clone again. + // Basically, this checking is needed to handle the case where different + // caller functions/callsites may need versions of this function + // containing different mixes of callsite clones across the different + // callsites within the function. If that happens, we need to create + // additional function clones to handle the various combinations. + // + // Keep track of any new clones of this callsite created by the + // following loop, as well as any existing clone that we decided to + // assign this clone to. 
+ std::map FuncCloneToNewCallsiteCloneMap; + FuncInfo FuncCloneAssignedToCurCallsiteClone; + // We need to be able to remove Edge from CallerEdges, so need to adjust + // iterator in the loop. + for (auto EI = Clone->CallerEdges.begin(); + EI != Clone->CallerEdges.end();) { + auto Edge = *EI; + // Ignore any caller that does not have a recorded callsite Call. + if (!Edge->Caller->hasCall()) { + EI++; + continue; + } + // If this caller already assigned to call a version of OrigFunc, need + // to ensure we can assign this callsite clone to that function clone. + if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { + FuncInfo FuncCloneCalledByCaller = + CallsiteToCalleeFuncCloneMap[Edge->Caller]; + // First we need to confirm that this function clone is available + // for use by this callsite node clone. + // + // While FuncCloneToCurNodeCloneMap is built only for this Node and + // its callsite clones, one of those callsite clones X could have + // been assigned to the same function clone called by Edge's caller + // - if Edge's caller calls another callsite within Node's original + // function, and that callsite has another caller reaching clone X. + // We need to clone Node again in this case. + if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && + FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != + Clone) || + // Detect when we have multiple callers of this callsite that + // have already been assigned to specific, and different, clones + // of OrigFunc (due to other unrelated callsites in Func they + // reach via call contexts). Is this Clone of callsite Node + // assigned to a different clone of OrigFunc? If so, clone Node + // again. + (FuncCloneAssignedToCurCallsiteClone && + FuncCloneAssignedToCurCallsiteClone != + FuncCloneCalledByCaller)) { + // We need to use a different newly created callsite clone, in + // order to assign it to another new function clone on a + // subsequent iteration over the Clones array (adjusted below). 
+ // Note we specifically do not reset the + // CallsiteToCalleeFuncCloneMap entry for this caller, so that + // when this new clone is processed later we know which version of + // the function to copy (so that other callsite clones we have + // assigned to that function clone are properly cloned over). See + // comments in the function cloning handling earlier. + + // Check if we already have cloned this callsite again while + // walking through caller edges, for a caller calling the same + // function clone. If so, we can move this edge to that new clone + // rather than creating yet another new clone. + if (FuncCloneToNewCallsiteCloneMap.count( + FuncCloneCalledByCaller)) { + ContextNode *NewClone = + FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; + moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); + // Cleanup any none type edges cloned over. + removeNoneTypeCalleeEdges(NewClone); + } else { + // Create a new callsite clone. + ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); + removeNoneTypeCalleeEdges(NewClone); + FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = + NewClone; + // Add to list of clones and process later. + Clones.push_back(NewClone); + assert(EI == Clone->CallerEdges.end() || + Clone->AllocTypes != (uint8_t)AllocationType::None); + assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); + } + // Moving the caller edge may have resulted in some none type + // callee edges. + removeNoneTypeCalleeEdges(Clone); + // We will handle the newly created callsite clone in a subsequent + // iteration over this Node's Clones. Continue here since we + // already adjusted iterator EI while moving the edge. + continue; + } + + // Otherwise, we can use the function clone already assigned to this + // caller. 
+ if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; + // Assign Clone to FuncCloneCalledByCaller + AssignCallsiteCloneToFuncClone( + FuncCloneCalledByCaller, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } else + // Don't need to do anything - callsite is already calling this + // function clone. + assert(FuncCloneAssignedToCurCallsiteClone == + FuncCloneCalledByCaller); + + } else { + // We have not already assigned this caller to a version of + // OrigFunc. Do the assignment now. + + // First check if we have already assigned this callsite clone to a + // clone of OrigFunc for another caller during this iteration over + // its caller edges. + if (!FuncCloneAssignedToCurCallsiteClone) { + // Find first function in FuncClonesToCallMap without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncClonesToCallMap size was smaller than the clone number. + for (auto &CF : FuncClonesToCallMap) { + if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { + FuncCloneAssignedToCurCallsiteClone = CF.first; + break; + } + } + assert(FuncCloneAssignedToCurCallsiteClone); + // Assign Clone to FuncCloneAssignedToCurCallsiteClone + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } else + assert(FuncCloneToCurNodeCloneMap + [FuncCloneAssignedToCurCallsiteClone] == Clone); + // Update callers to record function version called. 
+ RecordCalleeFuncOfCallsite(Edge->Caller, + FuncCloneAssignedToCurCallsiteClone); + } + + EI++; + } + } + if (VerifyCCG) { + checkNode(Node, /*CheckEdges=*/true); + for (const auto &PE : Node->CalleeEdges) + checkNode(PE->Callee, + /*CheckEdges=*/true); + for (const auto &CE : Node->CallerEdges) + checkNode(CE->Caller, + /*CheckEdges=*/true); + for (unsigned I = 0; I < Clones.size(); I++) { + ContextNode *Clone = Clones[I]; + checkNode(Clone, /*CheckEdges=*/true); + for (const auto &PE : Clone->CalleeEdges) + checkNode(PE->Callee, + /*CheckEdges=*/true); + for (const auto &CE : Clone->CallerEdges) + checkNode(CE->Caller, + /*CheckEdges=*/true); + } + } + } + } + + auto UpdateCalls = [&](ContextNode *Node, + DenseSet &Visited, + auto &&UpdateCalls) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + + for (auto *Clone : Node->Clones) + UpdateCalls(Clone, Visited, UpdateCalls); + + for (auto &Edge : Node->CallerEdges) + UpdateCalls(Edge->Caller, Visited, UpdateCalls); + + // Skip if either no call to update, or if we ended up with no context ids + // (we moved all edges onto other clones). + if (!Node->hasCall() || Node->ContextIds.empty()) + return; + + if (Node->IsAllocation) { + updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + return; + } + + if (!CallsiteToCalleeFuncCloneMap.count(Node)) + return; + + auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; + updateCall(Node->Call, CalleeFunc); + }; + + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) + UpdateCalls(Entry.second, Visited, UpdateCalls); + + return Changed; +} + +static SmallVector, 4> createFunctionClones( + Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, + std::map> + &FuncToAliasMap) { + // The first "clone" is the original copy, we should only call this if we + // needed to create new clones. 
+ assert(NumClones > 1); + SmallVector, 4> VMaps; + VMaps.reserve(NumClones - 1); + FunctionsClonedThinBackend++; + for (unsigned I = 1; I < NumClones; I++) { + VMaps.emplace_back(std::make_unique()); + auto *NewF = CloneFunction(&F, *VMaps.back()); + FunctionClonesThinBackend++; + // Strip memprof and callsite metadata from clone as they are no longer + // needed. + for (auto &BB : *NewF) { + for (auto &Inst : BB) { + Inst.setMetadata(LLVMContext::MD_memprof, nullptr); + Inst.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + // The clone's name is derived from the original function's name and the + // clone number I. + std::string Name = getMemProfFuncName(F.getName(), I); + auto *PrevF = M.getFunction(Name); + if (PrevF) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + // The new clone takes over its name and all of its uses before it is + // erased. + assert(PrevF->isDeclaration()); + NewF->takeName(PrevF); + PrevF->replaceAllUsesWith(NewF); + PrevF->eraseFromParent(); + } else + NewF->setName(Name); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) + << "created clone " << NV("NewFunction", NewF)); + + // Now handle aliases to this function, and clone those as well. + if (!FuncToAliasMap.count(&F)) + continue; + for (auto *A : FuncToAliasMap[&F]) { + // Note this Name shadows the function clone's Name declared above; it + // holds the name of the alias clone. + std::string Name = getMemProfFuncName(A->getName(), I); + auto *PrevA = M.getNamedAlias(Name); + auto *NewA = GlobalAlias::create(A->getValueType(), + A->getType()->getPointerAddressSpace(), + A->getLinkage(), Name, NewF); + NewA->copyAttributesFrom(A); + if (PrevA) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + // NOTE(review): PrevA comes from getNamedAlias(); confirm that + // isDeclaration() can be true for an alias, otherwise this assert + // trips whenever a same-named alias already exists. + assert(PrevA->isDeclaration()); + NewA->takeName(PrevA); + PrevA->replaceAllUsesWith(NewA); + PrevA->eraseFromParent(); + } + } + } + // VMaps holds one map per newly created clone (clones 1..NumClones-1); + // the original function has no entry. + return VMaps; +} + +// Locate the summary for F. This is complicated by the fact that it might +// have been internalized or promoted. 
+static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, + const ModuleSummaryIndex *ImportSummary) { + // Lookups are tried in order: F's current GUID, the GUID of its plain + // name, the GUID of its pre-promotion local identifier, and finally the + // index's mapping from original ids. + // FIXME: Ideally we would retain the original GUID in some fashion on the + // function (e.g. as metadata), but for now do our best to locate the + // summary without that information. + ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); + if (!TheFnVI) + // See if F was internalized, by checking the index directly with its + // original name (this avoids the name adjustment done by getGUID() for + // internal symbols). + TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); + if (TheFnVI) + return TheFnVI; + // Now query with the original name before any promotion was performed. + StringRef OrigName = + ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); + std::string OrigId = GlobalValue::getGlobalIdentifier( + OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); + TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); + if (TheFnVI) + return TheFnVI; + // Could be a promoted local imported from another module. We need to pass + // down more info here to find the original module id. For now, try with + // the OrigName which might have been stored in the OidGuidMap in the + // index. This would not work if there were same-named locals in multiple + // modules, however. + auto OrigGUID = + ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); + if (OrigGUID) + TheFnVI = ImportSummary->getValueInfo(OrigGUID); + // TheFnVI may still evaluate to false here if every lookup failed; the + // caller must check the result. + return TheFnVI; +} + +bool MemProfContextDisambiguation::applyImport(Module &M) { + assert(ImportSummary); + bool Changed = false; + if (!ImportSummary->withMemProfContextDisambiguation()) { + // The profile matcher applies hotness attributes directly for allocations, + // and those will cause us to generate calls to the hot/cold interfaces + // unconditionally. 
If context disambiguation was not enabled in the thin + // link then assume we don't want these calls (e.g. not linking with + // the appropriate library, or otherwise trying to disable this behavior). + // For now, simply strip existing hotness attributes so they aren't applied, + // and exit early since no cloning decisions were made. + for (auto &F : M) { + for (auto &BB : F) + for (auto &I : BB) { + auto *CI = dyn_cast(&I); + if (!CI) + continue; + if (CI->hasFnAttr("memprof")) { + CI->removeFnAttr("memprof"); + Changed = true; + } + // Strip off all memprof metadata as it is no longer needed. + // Importantly, this avoids the addition of new memprof attributes + // after inlining propagation. + CI->setMetadata(LLVMContext::MD_memprof, nullptr); + CI->setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + return Changed; + } + + auto IsMemProfClone = [](const Function &F) { + return F.getName().contains(MemProfCloneSuffix); + }; + + // We also need to clone any aliases that reference cloned functions, because + // the modified callsites may invoke via the alias. Keep track of the aliases + // for each function. + std::map> + FuncToAliasMap; + for (auto &A : M.aliases()) { + auto *Aliasee = A.getAliaseeObject(); + if (auto *F = dyn_cast(Aliasee)) + FuncToAliasMap[F].insert(&A); + } + + for (auto &F : M) { + if (F.isDeclaration() || IsMemProfClone(F)) + continue; + + OptimizationRemarkEmitter ORE(&F); + + SmallVector, 4> VMaps; + bool ClonesCreated = false; + unsigned NumClonesCreated = 0; + auto CloneFuncIfNeeded = [&](unsigned NumClones) { + // We should at least have version 0 which is the original copy. + assert(NumClones > 0); + // If only one copy needed use original. + if (NumClones == 1) + return; + // If we already performed cloning of this function, confirm that the + // requested number of clones matches (the thin link should ensure the + // number of clones for each constituent callsite is consistent within + // each function), before returning. 
+ if (ClonesCreated) { + assert(NumClonesCreated == NumClones); + return; + } + VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); + // The first "clone" is the original copy, which doesn't have a VMap. + assert(VMaps.size() == NumClones - 1); + Changed = true; + ClonesCreated = true; + NumClonesCreated = NumClones; + }; + + // Locate the summary for F. + ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); + // If not found, this could be an imported local (see comment in + // findValueInfoForFunc). Skip for now as it will be cloned in its original + // module (where it would have been promoted to global scope so should + // satisfy any reference in this module). + if (!TheFnVI) + continue; + + auto *GVSummary = + ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); + if (!GVSummary) + // Must have been imported, use the first summary (might be multiple if + // this was a linkonce_odr). + GVSummary = TheFnVI.getSummaryList().front().get(); + + // If this was an imported alias skip it as we won't have the function + // summary, and it should be cloned in the original module. + if (isa(GVSummary)) + continue; + + auto *FS = cast(GVSummary->getBaseObject()); + + if (FS->allocs().empty() && FS->callsites().empty()) + continue; + + auto SI = FS->callsites().begin(); + auto AI = FS->allocs().begin(); + + // Assume for now that the instructions are in the exact same order + // as when the summary was created, but confirm this is correct by + // matching the stack ids. + for (auto &BB : F) { + for (auto &I : BB) { + auto *CB = dyn_cast(&I); + // Same handling as when creating module summary. + if (!mayHaveMemprofSummary(CB)) + continue; + + CallStack CallsiteContext( + I.getMetadata(LLVMContext::MD_callsite)); + auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); + + // Include allocs that were already assigned a memprof function + // attribute in the statistics. 
+ if (CB->getAttributes().hasFnAttr("memprof")) { + assert(!MemProfMD); + CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" + ? AllocTypeColdThinBackend++ + : AllocTypeNotColdThinBackend++; + OrigAllocsThinBackend++; + AllocVersionsThinBackend++; + if (!MaxAllocVersionsThinBackend) + MaxAllocVersionsThinBackend = 1; + // Remove any remaining callsite metadata and we can skip the rest of + // the handling for this instruction, since no cloning needed. + I.setMetadata(LLVMContext::MD_callsite, nullptr); + continue; + } + + if (MemProfMD) { + // Consult the next alloc node. + assert(AI != FS->allocs().end()); + auto &AllocNode = *(AI++); + + // Sanity check that the MIB stack ids match between the summary and + // instruction metadata. + auto MIBIter = AllocNode.MIBs.begin(); + for (auto &MDOp : MemProfMD->operands()) { + assert(MIBIter != AllocNode.MIBs.end()); + auto StackIdIndexIter = MIBIter->StackIdIndices.begin(); + auto *MIBMD = cast(MDOp); + MDNode *StackMDNode = getMIBStackNode(MIBMD); + assert(StackMDNode); + // Previous stack id visited for this MIB, held as uint64_t rather + // than in a container whose element type could narrow it, so the + // duplicate comparison below is exact. + bool HavePrevStackId = false; + uint64_t PrevStackId = 0; + CallStack StackContext(StackMDNode); + for (auto ContextIter = + StackContext.beginAfterSharedPrefix(CallsiteContext); + ContextIter != StackContext.end(); ++ContextIter) { + // If this is a direct recursion, simply skip the duplicate + // entries, to be consistent with how the summary ids were + // generated during ModuleSummaryAnalysis. + if (HavePrevStackId && PrevStackId == *ContextIter) + continue; + // Record the id just visited; the skip above can only fire if the + // previous id is actually remembered. + HavePrevStackId = true; + PrevStackId = *ContextIter; + assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); + assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == + *ContextIter); + StackIdIndexIter++; + } + MIBIter++; + } + + // Perform cloning if not yet done. 
+ CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); + + OrigAllocsThinBackend++; + AllocVersionsThinBackend += AllocNode.Versions.size(); + if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) + MaxAllocVersionsThinBackend = AllocNode.Versions.size(); + + // If there is only one version that means we didn't end up + // considering this function for cloning, and in that case the alloc + // will still be none type or should have gotten the default NotCold. + // Skip that after calling clone helper since that does some sanity + // checks that confirm we haven't decided yet that we need cloning. + if (AllocNode.Versions.size() == 1) { + assert((AllocationType)AllocNode.Versions[0] == + AllocationType::NotCold || + (AllocationType)AllocNode.Versions[0] == + AllocationType::None); + UnclonableAllocsThinBackend++; + continue; + } + + // All versions should have a singular allocation type. + assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { + return Type == ((uint8_t)AllocationType::NotCold | + (uint8_t)AllocationType::Cold); + })); + + // Update the allocation types per the summary info. + for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { + // Ignore any that didn't get an assigned allocation type. + if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) + continue; + AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; + AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ + : AllocTypeNotColdThinBackend++; + std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); + auto A = llvm::Attribute::get(F.getContext(), "memprof", + AllocTypeString); + CallBase *CBClone; + // Copy 0 is the original function. + if (!J) + CBClone = CB; + else + // Since VMaps are only created for new clones, we index with + // clone J-1 (J==0 is the original clone and does not have a VMaps + // entry). 
+ CBClone = cast((*VMaps[J - 1])[CB]); + CBClone->addFnAttr(A); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) + << NV("AllocationCall", CBClone) << " in clone " + << NV("Caller", CBClone->getFunction()) + << " marked with memprof allocation attribute " + << NV("Attribute", AllocTypeString)); + } + } else if (!CallsiteContext.empty()) { + // Consult the next callsite node. + assert(SI != FS->callsites().end()); + auto &StackNode = *(SI++); + +#ifndef NDEBUG + // Sanity check that the stack ids match between the summary and + // instruction metadata. + auto StackIdIndexIter = StackNode.StackIdIndices.begin(); + for (auto StackId : CallsiteContext) { + assert(StackIdIndexIter != StackNode.StackIdIndices.end()); + assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == + StackId); + StackIdIndexIter++; + } +#endif + + // Perform cloning if not yet done. + CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); + + // Should have skipped indirect calls via mayHaveMemprofSummary. + assert(CB->getCalledFunction()); + assert(!IsMemProfClone(*CB->getCalledFunction())); + + // Update the calls per the summary info. + // Save orig name since it gets updated in the first iteration + // below. + auto CalleeOrigName = CB->getCalledFunction()->getName(); + for (unsigned J = 0; J < StackNode.Clones.size(); J++) { + // Do nothing if this version calls the original version of its + // callee. + if (!StackNode.Clones[J]) + continue; + auto NewF = M.getOrInsertFunction( + getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]), + CB->getCalledFunction()->getFunctionType()); + CallBase *CBClone; + // Copy 0 is the original function. 
+ if (!J) + CBClone = CB; + else + CBClone = cast((*VMaps[J - 1])[CB]); + CBClone->setCalledFunction(NewF); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) + << NV("Call", CBClone) << " in clone " + << NV("Caller", CBClone->getFunction()) + << " assigned to call function clone " + << NV("Callee", NewF.getCallee())); + } + } + // Memprof and callsite metadata on memory allocations no longer needed. + I.setMetadata(LLVMContext::MD_memprof, nullptr); + I.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + } + + return Changed; +} + template bool CallsiteContextGraph::process() { if (DumpCCG) { @@ -2149,21 +3157,61 @@ if (ExportToDot) exportToDot("cloned"); - return false; + bool Changed = assignFunctions(); + + if (DumpCCG) { + dbgs() << "CCG after assigning function clones:\n"; + dbgs() << *this; + } + if (ExportToDot) + exportToDot("clonefuncassign"); + + return Changed; } -bool MemProfContextDisambiguation::processModule(Module &M) { +bool MemProfContextDisambiguation::processModule( + Module &M, + function_ref OREGetter) { bool Changed = false; - ModuleCallsiteContextGraph CCG(M); + // If we have an import summary, then the cloning decisions were made during + // the thin link on the index. Apply them and return. + if (ImportSummary) { + Changed = applyImport(M); + return Changed; + } + + ModuleCallsiteContextGraph CCG(M, OREGetter); Changed = CCG.process(); return Changed; } +MemProfContextDisambiguation::MemProfContextDisambiguation( + const ModuleSummaryIndex *Summary) + : ImportSummary(Summary) { + // The MemProfImportSummary should only be used for testing ThinLTO + // distributed backend handling via opt, in which case we don't have a summary + // from the pass pipeline. 
+ assert(!ImportSummary || MemProfImportSummary.empty()); + if (!ImportSummary && !MemProfImportSummary.empty()) { + ExitOnError ExitOnErr("-memprof-import-summary: " + MemProfImportSummary + + ": "); + auto ReadSummaryFile = ExitOnErr( + errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary))); + ImportSummaryForTesting = + ExitOnErr(getModuleSummaryIndex(*ReadSummaryFile)); + ImportSummary = ImportSummaryForTesting.get(); + } +} + PreservedAnalyses MemProfContextDisambiguation::run(Module &M, ModuleAnalysisManager &AM) { - if (!processModule(M)) + auto &FAM = AM.getResult(M).getManager(); + auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { + return FAM.getResult(*F); + }; + if (!processModule(M, OREGetter)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -2174,4 +3222,5 @@ isPrevailing) { IndexCallsiteContextGraph CCG(Index, isPrevailing); CCG.process(); + Index.setWithMemProfContextDisambiguation(); } diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll --- a/llvm/test/ThinLTO/X86/memprof-basic.ll +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -39,18 +39,49 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should have cloned bar, baz, and foo, for the cold memory allocation. +; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Check distributed index +; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB + +;; Run ThinLTO backend +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS source_filename = "memprof-basic.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define i32 @main() { +define i32 @main() #0 { entry: %call = call ptr @_Z3foov(), !callsite !0 %call1 = call ptr @_Z3foov(), !callsite !1 @@ -61,7 +92,7 @@ declare i32 @sleep() -define internal ptr @_Z3barv() { +define internal ptr @_Z3barv() #0 { entry: %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 ret ptr null @@ -69,13 +100,13 @@ declare ptr @_Znam(i64) -define internal ptr @_Z3bazv() { +define internal ptr @_Z3bazv() #0 { entry: %call = call ptr @_Z3barv(), !callsite !8 ret ptr null } 
-define internal ptr @_Z3foov() { +define internal ptr @_Z3foov() #0 { entry: %call = call ptr @_Z3bazv(), !callsite !9 ret ptr null @@ -84,6 +115,8 @@ ; uselistorder directives uselistorder ptr @_Z3foov, { 1, 0 } +attributes #0 = { noinline optnone } + !0 = !{i64 8632435727821051414} !1 = !{i64 -3421689549917153178} !2 = !{!3, !5} @@ -227,6 +260,52 @@ ; DUMP: Clone of [[BAR]] +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; @@ -258,3 +337,9 @@ ; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; ; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"]; ; DOTCLONED: } + + +; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1) +; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1) +; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold) +; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1) diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll --- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -1,7 +1,8 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple inlined ;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. +;; while matching callsite nodes onto the graph. Also tests graph and IR +;; cloning. ;; ;; Original code looks like: ;; @@ -60,19 +61,51 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST ;; We should clone D once for the cold allocations via C. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST +;; We should clone D once for the cold allocations via C. 
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Check distributed index +; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB + +;; Run ThinLTO backend +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS source_filename = "duplicate-context-ids.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define internal ptr @_Z1Dv() { +define internal ptr @_Z1Dv() #0 { entry: %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 ret ptr null @@ -80,36 +113,44 @@ declare ptr @_Znam(i64) -define internal ptr @_Z1Fv() { +define internal ptr @_Z1Fv() #0 { entry: %call = call ptr @_Z1Dv(), !callsite !6 ret ptr null } -define internal ptr @_Z1Cv() { +define internal ptr @_Z1Cv() #0 { entry: %call = call ptr @_Z1Dv(), !callsite !7 ret ptr null } -define internal ptr @_Z1Bv() { +define internal ptr @_Z1Bv() #0 { entry: %call.i = call ptr @_Z1Dv(), !callsite !8 ret ptr null } -define internal ptr @_Z1Ev() { +define internal ptr @_Z1Ev() #0 { entry: %call.i = call ptr @_Z1Dv(), !callsite !9 ret ptr null } -declare i32 @main() +define i32 @main() #0 { +entry: + call ptr @_Z1Bv() + call ptr @_Z1Ev() + call ptr @_Z1Fv() + ret i32 0 +} declare void @_ZdaPv() declare i32 @sleep() +attributes #0 = { noinline optnone} + !0 = !{!1, !3} !1 = !{!2, !"cold"} !2 = !{i64 6541423618768552252, i64 -6270142974039008131} @@ -267,6 +308,44 @@ ; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 ; DUMP: Clone of [[D]] +; REMARKS: created clone _Z1Dv.memprof.1 +; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof 
allocation attribute cold +; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1 +; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1 + + +;; The allocation via F does not allocate cold memory. It should call the +;; original D, which ultimately calls the original allocation decorated +;; with a "notcold" attribute. +; IR: define internal {{.*}} @_Z1Dv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z1Fv() +; IR: call {{.*}} @_Z1Dv() +;; The allocations via B and E allocate cold memory. They should call the +;; cloned D, which ultimately calls the cloned allocation decorated with a +;; "cold" attribute. +; IR: define internal {{.*}} @_Z1Bv() +; IR: call {{.*}} @_Z1Dv.memprof.1() +; IR: define internal {{.*}} @_Z1Ev() +; IR: call {{.*}} @_Z1Dv.memprof.1() +; IR: define internal {{.*}} @_Z1Dv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 
memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + ; DOTPRE: digraph "prestackupdate" { ; DOTPRE: label="prestackupdate"; @@ -305,3 +384,9 @@ ; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"]; ; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"]; ; DOTCLONED: } + +; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1) +; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold) +; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1) +; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0) +; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1) diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll @@ -0,0 +1,291 @@ +;; Test context disambiguation for a callgraph containing multiple memprof +;; contexts and no inlining, where we need to perform additional cloning +;; during function assignment/cloning to handle the combination of contexts +;; to 2 different allocations. 
+;; +;; void E(char **buf1, char **buf2) { +;; *buf1 = new char[10]; +;; *buf2 = new char[10]; +;; } +;; +;; void B(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void C(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void D(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; int main(int argc, char **argv) { +;; char *cold1, *cold2, *default1, *default2, *default3, *default4; +;; B(&default1, &default2); +;; C(&default3, &cold1); +;; D(&cold2, &default4); +;; memset(cold1, 0, 10); +;; memset(cold2, 0, 10); +;; memset(default1, 0, 10); +;; memset(default2, 0, 10); +;; memset(default3, 0, 10); +;; memset(default4, 0, 10); +;; delete[] default1; +;; delete[] default2; +;; delete[] default3; +;; delete[] default4; +;; sleep(10); +;; delete[] cold1; +;; delete[] cold2; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +;; Run ThinLTO backend +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + + +source_filename = "funcassigncloning.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline optnone +define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { +entry: + %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7 + %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15 + ret void +} + +declare ptr @_Znam(i64) + +define internal void @_Z1BPPcS0_() { +entry: + call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16 + ret void +} + +define internal void @_Z1CPPcS0_() { +entry: + call void 
@_Z1EPPcS0_(ptr null, ptr null), !callsite !17 + ret void +} + +define internal void @_Z1DPPcS0_() { +entry: + call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18 + ret void +} + +; Function Attrs: noinline optnone +define i32 @main() #0 { +entry: + call void @_Z1BPPcS0_() + call void @_Z1CPPcS0_() + call void @_Z1DPPcS0_() + ret i32 0 +} + +declare void @_ZdaPv() + +declare i32 @sleep() + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0 } + +attributes #0 = { noinline optnone } + +!0 = !{!1, !3, !5} +!1 = !{!2, !"cold"} +!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!3 = !{!4, !"notcold"} +!4 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!5 = !{!6, !"notcold"} +!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!7 = !{i64 -3461278137325233666} +!8 = !{!9, !11, !13} +!9 = !{!10, !"notcold"} +!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!11 = !{!12, !"cold"} +!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!13 = !{!14, !"notcold"} +!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!15 = !{i64 -1415475215210681400} +!16 = !{i64 -2441057035866683071} +!17 = !{i64 -3483158674395044949} +!18 = !{i64 -7799663586031895603} + + +;; Originally we create a single clone of each call to new from E, since each +;; allocates cold memory for a single caller. 
+ +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: AllocType 1 StackIds: 2 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] + +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[C]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW2ORIG]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotCold +; 
DUMP: ContextIds: 4 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Clones: [[ENEW2CLONE]] + +; DUMP: Node [[ENEW1CLONE]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: AllocType 1 StackIds: 2 +; DUMP: (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Clone of [[ENEW1ORIG]] + +; DUMP: Node [[ENEW2CLONE]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: Clone of [[ENEW2ORIG]] + + +;; We greedily create a clone of E that is initially used by the clones of the +;; first call to new. However, we end up with an incompatible set of callers +;; given the second call to new which has clones with a different combination of +;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. 
+; REMARKS: created clone _Z1EPPcS0_.memprof.1 +; REMARKS: created clone _Z1EPPcS0_.memprof.2 +; REMARKS: created clone _Z1EPPcS0_.memprof.3 +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3 +; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2 + + +;; Original version of E is used for the non-cold allocations, both from B. +; IR: define internal {{.*}} @_Z1EPPcS0_( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1BPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_( +;; C calls a clone of E with the first new allocating cold memory and the +;; second allocating non-cold memory. +; IR: define internal {{.*}} @_Z1CPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.memprof.3( +;; D calls a clone of E with the first new allocating non-cold memory and the +;; second allocating cold memory. 
+; IR: define internal {{.*}} @_Z1DPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.memprof.2( +; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 8 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 4 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll --- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation for call graph containing 
indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. +;; cloning. Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -61,13 +61,44 @@ ; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call ;; from main allocating cold memory. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should only create a single clone of foo, for the direct call +;; from main allocating cold memory. 
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Run ThinLTO backend +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS source_filename = "indirectcall.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -76,12 +107,12 @@ @_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr @_ZTVN10__cxxabiv117__class_type_infoE = external global ptr -define internal ptr @_Z3barP1A(ptr %a) { +define internal ptr @_Z3barP1A(ptr %a) #0 { entry: ret ptr null } -define i32 @main() { +define i32 @main() #0 { entry: %call = call ptr @_Z3foov(), !callsite !0 %call1 = call ptr @_Z3foov(), !callsite !1 @@ -96,19 +127,19 @@ declare i32 @sleep() -define internal ptr @_ZN1A1xEv() { +define internal ptr @_ZN1A1xEv() #0 { entry: %call = call ptr @_Z3foov(), !callsite !6 ret ptr null } -define internal ptr @_ZN1B1xEv() { +define internal ptr @_ZN1B1xEv() #0 { entry: %call = call ptr @_Z3foov(), !callsite !7 ret ptr null } -define internal ptr @_Z3foov() { +define internal ptr @_Z3foov() #0 { entry: %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21 ret ptr null @@ -119,6 +150,8 @@ ; uselistorder directives uselistorder ptr @_Z3foov, { 3, 2, 1, 0 } +attributes #0 = { noinline optnone } + !0 = !{i64 8632435727821051414} !1 = !{i64 -3421689549917153178} !2 = !{i64 6792096022461663180} @@ -359,6 +392,41 @@ ; DUMP: Clone of [[FOO]] +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold + + +; IR: define {{.*}} @main( +; IR: call {{.*}} @_Z3foov() +;; 
Only the second call to foo, which allocates cold memory via direct calls, +;; is replaced with a call to a clone that calls a cold allocation. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3barP1A( +; IR: call {{.*}} @_Z3barP1A( +; IR: call {{.*}} @_Z3barP1A( +; IR: call {{.*}} @_Z3barP1A( +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 
6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"]; diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll --- a/llvm/test/ThinLTO/X86/memprof-inlined.ll +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -1,6 +1,7 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto the graph. +;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -48,19 +49,50 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE \ +; RUN: --check-prefix=STATS-INPROCESS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate ;; cold memory. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. 
\ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should create clones for foo and bar for the call from main to allocate +;; cold memory. +; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Run ThinLTO backend +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=STATS-DISTRIB-BE \ +; RUN: --check-prefix=REMARKS source_filename = "inlined.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define internal ptr @_Z3barv() { +define internal ptr @_Z3barv() #0 { entry: %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 ret ptr null @@ -68,19 +100,19 @@ declare ptr @_Znam(i64) -define internal ptr @_Z3bazv() { +define internal ptr @_Z3bazv() #0 { entry: %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6 ret ptr null } -define internal ptr @_Z3foov() { +define internal ptr @_Z3foov() #0 { entry: %call.i = call ptr @_Z3barv(), !callsite !7 ret ptr null } -define i32 @main() { +define i32 @main() #0 { entry: %call = call ptr @_Z3foov(), !callsite !8 %call1 = call ptr @_Z3foov(), !callsite !9 @@ -91,6 +123,8 @@ declare i32 @sleep() +attributes #0 = { noinline optnone } + !0 = !{!1, !3} !1 = !{!2, !"notcold"} !2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} @@ -257,6 +291,52 @@ ; DUMP: Clone of [[BAR]] +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation 
attribute cold +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 + + +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3barv() +; IR: define {{.*}} @main() +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-INPROCESS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +;; The distributed backend hasn't yet eliminated the now-dead baz with +;; the allocation from bar inlined, so it has one more allocation. 
+; STATS-DISTRIB-BE: 3 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 2 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-INPROCESS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend +;; The distributed backend hasn't yet eliminated the now-dead baz with +;; the allocation from bar inlined, so it has one more allocation. +; STATS-DISTRIB-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -1,5 +1,5 @@ ;; Test callsite context graph generation for simple call graph with -;; two memprof contexts and no inlining. +;; two memprof contexts and no inlining, as well as graph and IR cloning. ;; ;; Original code looks like: ;; @@ -34,7 +34,9 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -222,6 +224,48 @@ ; DUMP: Clone of [[BAR]] +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll --- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll @@ -1,7 +1,8 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple inlined ;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. +;; while matching callsite nodes onto the graph. Also tests graph and IR +;; cloning. 
;; ;; Original code looks like: ;; @@ -55,7 +56,9 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST @@ -263,6 +266,39 @@ ; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4 ; DUMP: Clone of [[D]] +; REMARKS: created clone _Z1Dv.memprof.1 +; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1 +; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.memprof.1 +; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1 +; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv +; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold + + +;; The allocation via F does not allocate cold memory. It should call the +;; original D, which ultimately calls the original allocation decorated +;; with a "notcold" attribute. +; IR: define internal {{.*}} @_Z1Dv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z1Fv() +; IR: call {{.*}} @_Z1Dv() +;; The allocations via B and E allocate cold memory. They should call the +;; cloned D, which ultimately calls the cloned allocation decorated with a +;; "cold" attribute. 
+; IR: define internal {{.*}} @_Z1Bv() +; IR: call {{.*}} @_Z1Dv.memprof.1() +; IR: define internal {{.*}} @_Z1Ev() +; IR: call {{.*}} @_Z1Dv.memprof.1() +; IR: define internal {{.*}} @_Z1Dv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis + ; DOTPRE: digraph "prestackupdate" { ; DOTPRE: label="prestackupdate"; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll @@ -0,0 +1,244 @@ +;; Test context disambiguation for a callgraph containing multiple memprof +;; contexts and no inlining, where we need to perform additional cloning +;; during function assignment/cloning to handle the combination of contexts +;; to 2 different allocations. 
+;; +;; void E(char **buf1, char **buf2) { +;; *buf1 = new char[10]; +;; *buf2 = new char[10]; +;; } +;; +;; void B(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void C(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void D(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; int main(int argc, char **argv) { +;; char *cold1, *cold2, *default1, *default2, *default3, *default4; +;; B(&default1, &default2); +;; C(&default3, &cold1); +;; D(&cold2, &default4); +;; memset(cold1, 0, 10); +;; memset(cold2, 0, 10); +;; memset(default1, 0, 10); +;; memset(default2, 0, 10); +;; memset(default3, 0, 10); +;; memset(default4, 0, 10); +;; delete[] default1; +;; delete[] default2; +;; delete[] default3; +;; delete[] default4; +;; sleep(10); +;; delete[] cold1; +;; delete[] cold2; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. +;; +;; The IR was then reduced using llvm-reduce with the expected FileCheck input. 
+ +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7 + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15 + ret void +} + +declare ptr @_Znam(i64) #1 + +define internal void @_Z1BPPcS0_(ptr %0, ptr %1) { +entry: + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16 + ret void +} + +; Function Attrs: noinline +define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 { +entry: + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17 + ret void +} + +define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 { +entry: + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +declare i32 @sleep() #5 + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0 } + +attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #1 = { "no-trapping-math"="true" } +attributes #2 = { noinline } +attributes #3 = { "frame-pointer"="all" } +attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #5 = { "disable-tail-calls"="true" } +attributes #6 = { builtin } + +!0 = !{!1, !3, !5} +!1 = !{!2, !"cold"} +!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!3 = !{!4, !"notcold"} +!4 = !{i64 
-3461278137325233666, i64 -3483158674395044949} +!5 = !{!6, !"notcold"} +!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!7 = !{i64 -3461278137325233666} +!8 = !{!9, !11, !13} +!9 = !{!10, !"notcold"} +!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!11 = !{!12, !"cold"} +!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!13 = !{!14, !"notcold"} +!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!15 = !{i64 -1415475215210681400} +!16 = !{i64 -2441057035866683071} +!17 = !{i64 -3483158674395044949} +!18 = !{i64 -7799663586031895603} + + +;; Originally we create a single clone of each call to new from E, since each +;; allocates cold memory for a single caller. + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] + +; DUMP: Node [[D:0x[a-z0-9]+]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 6 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[C]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] 
AllocTypes: Cold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 3 4 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW2ORIG]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Clones: [[ENEW2CLONE]] + +; DUMP: Node [[ENEW1CLONE]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Clone of [[ENEW1ORIG]] + +; DUMP: Node [[ENEW2CLONE]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: Clone of [[ENEW2ORIG]] + + +;; We greedily create a clone of E that is initially used by the clones of the +;; first call to new. However, we end up with an incompatible set of callers +;; given the second call to new which has clones with a different combination of +;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. 
+; REMARKS: created clone _Z1EPPcS0_.memprof.1 +; REMARKS: created clone _Z1EPPcS0_.memprof.2 +; REMARKS: created clone _Z1EPPcS0_.memprof.3 +; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2 +; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3 +; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_ +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold + + +;; Original version of E is used for the non-cold allocations, both from B. +; IR: define internal {{.*}} @_Z1EPPcS0_( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1BPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_( +;; C calls a clone of E with the first new allocating non-cold memory and the +;; second allocating cold memory. +; IR: define internal {{.*}} @_Z1CPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.memprof.3( +;; D calls a clone of E with the first new allocating cold memory and the +;; second allocating non-cold memory. +; IR: define internal {{.*}} @_Z1DPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.memprof.2( +;; Transient clone that will get removed as it ends up with no callers. +;; Its calls to new never get updated with a memprof attribute as a result. 
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.1( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]] +; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3( +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[DEFAULT]] = { builtin } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll --- a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation for call graph containing indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. +;; cloning. Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -54,7 +54,9 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call @@ -340,6 +342,41 @@ ; DUMP: Clone of [[FOO]] +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov +; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold + + +; IR: define {{.*}} @main( +; IR: call {{.*}} @_Z3foov() +;; Only the second call to foo, which allocates cold memory via direct calls, +;; is replaced with a call to a clone that calls a cold allocation. 
+; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3barP1A( +; IR: call {{.*}} @_Z3barP1A( +; IR: call {{.*}} @_Z3barP1A( +; IR: call {{.*}} @_Z3barP1A( +; IR: define internal {{.*}} @_ZN1A1xEv( +; IR: call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_ZN1B1xEv( +; IR: call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"]; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll --- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll @@ -1,6 +1,7 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto the graph. +;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -43,7 +44,9 @@ ; RUN: opt -passes=memprof-context-disambiguation \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. 
\ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate @@ -251,6 +254,42 @@ ; DUMP: Clone of [[BAR]] +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold + + +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3barv() +; IR: define {{.*}} @main() +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph "postbuild" { ; DOT: label="postbuild"; ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];