diff --git a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h --- a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h +++ b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h @@ -99,6 +99,10 @@ ImmutablePass * createImmutableModuleSummaryIndexWrapperPass(const ModuleSummaryIndex *Index); +/// Returns true if the instruction could have memprof metadata, used to ensure +/// consistency between summary analysis and the ThinLTO backend processing. +bool mayHaveMemprofSummary(const CallBase *CB); + } // end namespace llvm #endif // LLVM_ANALYSIS_MODULESUMMARYANALYSIS_H diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1564,6 +1564,11 @@ Attrs = Attrs.removeFnAttribute(getContext(), Kind); } + /// Removes the attribute from the function + void removeFnAttr(StringRef Kind) { + Attrs = Attrs.removeFnAttribute(getContext(), Kind); + } + /// Removes the attribute from the return value void removeRetAttr(Attribute::AttrKind Kind) { Attrs = Attrs.removeRetAttribute(getContext(), Kind); diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1300,6 +1300,10 @@ /// Indicates that summary-based synthetic entry count propagation has run bool HasSyntheticEntryCounts = false; + /// Indicates that summary-based profile guided heap optimization context + /// disambiguation has run. + bool WithPGHOContextDisambiguation = false; + /// Indicates that distributed backend should skip compilation of the /// module. 
Flag is suppose to be set by distributed ThinLTO indexing /// when it detected that the module is not needed during the final @@ -1503,6 +1507,13 @@ bool hasSyntheticEntryCounts() const { return HasSyntheticEntryCounts; } void setHasSyntheticEntryCounts() { HasSyntheticEntryCounts = true; } + bool withPGHOContextDisambiguation() const { + return WithPGHOContextDisambiguation; + } + void setWithPGHOContextDisambiguation() { + WithPGHOContextDisambiguation = true; + } + bool skipModuleByDistributedBackend() const { return SkipModuleByDistributedBackend; } diff --git a/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h --- a/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h +++ b/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h @@ -17,22 +17,36 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringSet.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/PassManager.h" #include namespace llvm { class GlobalValueSummary; class Module; -class ModuleSummaryIndex; +class OptimizationRemarkEmitter; class PGHOContextDisambiguation : public PassInfoMixin { /// Run the context disambiguator on \p TheModule, returns true if any changes /// was made. - bool processModule(Module &M); + bool processModule( + Module &M, + function_ref OREGetter); + + /// In the ThinLTO backend, apply the cloning decisions in ImportSummary to + /// the IR. + bool applyImport(Module &M); + + /// Import summary containing cloning decisions for the ThinLTO backend. + const ModuleSummaryIndex *ImportSummary; + + // Owns the import summary specified by internal options for testing the + // ThinLTO backend via opt (to simulate distributed ThinLTO). 
+ std::unique_ptr ImportSummaryForTesting; public: - PGHOContextDisambiguation() {} + PGHOContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -282,6 +282,10 @@ std::vector Callsites; std::vector Allocs; +#ifndef NDEBUG + DenseSet CallsThatMayHaveMemprofSummary; +#endif + bool HasInlineAsmMaybeReferencingInternal = false; bool HasIndirBranchToBlockAddress = false; bool HasUnknownCall = false; @@ -425,6 +429,10 @@ .updateHotness(getHotness(Candidate.Count, PSI)); } + // Summarize memprof related metadata. This is only needed for ThinLTO. + if (!IsThinLTO) + continue; + // TODO: Skip indirect calls for now. Need to handle these better, likely // by creating multiple Callsites, one per target, then speculatively // devirtualize while applying clone info in the ThinLTO backends. This @@ -435,6 +443,14 @@ if (!CalledFunction) continue; + // Ensure we keep this analysis in sync with the handling in the ThinLTO + // backend (see PGHOContextDisambiguation::applyImport). Save this call + // so that we can skip it in checking the reverse case later. + assert(mayHaveMemprofSummary(CB)); +#ifndef NDEBUG + CallsThatMayHaveMemprofSummary.insert(CB); +#endif + // Compute the list of stack ids first (so we can trim them from the stack // ids on any MIBs). CallStack InstCallsite( @@ -542,6 +558,25 @@ ? CalleeInfo::HotnessType::Cold : CalleeInfo::HotnessType::Critical); +#ifndef NDEBUG + // Make sure that all calls we decided could not have memprof summaries get a + // false value for mayHaveMemprofSummary, to ensure that this handling remains + // in sync with the ThinLTO backend handling. 
+ if (IsThinLTO) { + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + const auto *CB = dyn_cast(&I); + if (!CB) + continue; + // We already checked these above. + if (CallsThatMayHaveMemprofSummary.count(CB)) + continue; + assert(!mayHaveMemprofSummary(CB)); + } + } + } +#endif + bool NonRenamableLocal = isNonRenamableLocal(F); bool NotEligibleForImport = NonRenamableLocal || HasInlineAsmMaybeReferencingInternal || @@ -1033,3 +1068,36 @@ INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info", "Module summary info", false, true) + +bool llvm::mayHaveMemprofSummary(const CallBase *CB) { + if (!CB) + return false; + if (CB->isDebugOrPseudoInst()) + return false; + auto *CI = dyn_cast(CB); + auto *CalledValue = CB->getCalledOperand(); + auto *CalledFunction = CB->getCalledFunction(); + if (CalledValue && !CalledFunction) { + CalledValue = CalledValue->stripPointerCasts(); + // Stripping pointer casts can reveal a called function. + CalledFunction = dyn_cast(CalledValue); + } + // Check if this is an alias to a function. If so, get the + // called aliasee for the checks below. + if (auto *GA = dyn_cast(CalledValue)) { + assert(!CalledFunction && + "Expected null called function in callsite for alias"); + CalledFunction = dyn_cast(GA->getAliaseeObject()); + } + // A call qualifies only if it resolves to a known function; intrinsic + // callees reached via CI are rejected, as are unresolved (indirect) calls. + if (CalledFunction) { + if (CI && CalledFunction->isIntrinsic()) + return false; + } else { + // TODO: For now skip indirect calls. See comments in + // computeFunctionSummary for what is needed to handle this. + return false; + } + return true; +} diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8018,7 +8018,7 @@ case bitc::FS_FLAGS: { // [flags] uint64_t Flags = Record[0]; // Scan flags. 
- assert(Flags <= 0xff && "Unexpected bits in flag"); + assert(Flags <= 0x1ff && "Unexpected bits in flag"); return Flags & 0x8; } diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -107,11 +107,13 @@ Flags |= 0x40; if (withWholeProgramVisibility()) Flags |= 0x80; + if (withPGHOContextDisambiguation()) + Flags |= 0x100; return Flags; } void ModuleSummaryIndex::setFlags(uint64_t Flags) { - assert(Flags <= 0xff && "Unexpected bits in flag"); + assert(Flags <= 0x1ff && "Unexpected bits in flag"); // 1 bit: WithGlobalValueDeadStripping flag. // Set on combined index only. if (Flags & 0x1) @@ -145,6 +147,10 @@ // Set on combined index only. if (Flags & 0x80) setWithWholeProgramVisibility(); + // 1 bit: WithPGHOContextDisambiguation flag. + // Set on combined index only. + if (Flags & 0x100) + setWithPGHOContextDisambiguation(); } // Collect for the given module the list of function it defines diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1506,6 +1506,11 @@ MPM.addPass(Annotation2MetadataPass()); if (ImportSummary) { + // For ThinLTO we must apply the context disambiguation decisions early, to + // ensure we can correctly match the callsites to summary data. + if (EnablePGHOContextDisambiguation) + MPM.addPass(PGHOContextDisambiguation(ImportSummary)); + // These passes import type identifier resolutions for whole-program // devirtualization and CFI. 
They must run early because other passes may // disturb the specific instruction patterns that these passes look for, diff --git a/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp --- a/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp @@ -27,8 +27,11 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -38,13 +41,40 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h" #include #include using namespace llvm; using namespace llvm::memprof; +using namespace ore; #define DEBUG_TYPE "pgho-context-disambiguation" +STATISTIC(FunctionClonesAnalysis, + "Number of function clones created during whole program analysis"); +STATISTIC(FunctionClonesThinBackend, + "Number of function clones created during ThinLTO backend"); +STATISTIC(FunctionsClonedThinBackend, + "Number of functions that had clones created during ThinLTO backend"); +STATISTIC(AllocTypeNotCold, + "Number of not cold static allocations (possibly cloned)"); +STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned)"); +STATISTIC(AllocTypeNotColdThinBackend, + "Number of not cold static allocations (possibly cloned) during " + "ThinLTO backend"); +STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations " + "(possibly cloned) during ThinLTO backend"); +STATISTIC(OrigAllocsThinBackend, + "Number of original (not cloned) allocations with memprof profiles " + "during ThinLTO backend"); +STATISTIC( + 
+ AllocVersionsThinBackend, + "Number of allocation versions (including clones) during ThinLTO backend"); +STATISTIC(MaxAllocVersionsThinBackend, + "Maximum number of allocation versions created for an original " + "allocation during ThinLTO backend"); +STATISTIC(UnclonableAllocsThinBackend, + "Number of unclonable ambigous allocations during ThinLTO backend"); static cl::opt DotFilePathPrefix( "pgho-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -65,6 +95,11 @@ VerifyNodes("pgho-verify-nodes", cl::init(false), cl::Hidden, cl::desc("Perform frequent verification checks on nodes.")); +static cl::opt PGHOImportSummary( + "pgho-import-summary", + cl::desc("Import summary to use for testing the ThinLTO backend via opt"), + cl::Hidden); + inline bool hasSingleAllocType(uint8_t AllocTypes) { switch (AllocTypes) { case (uint8_t)AllocationType::Cold: @@ -111,6 +146,8 @@ /// behavior of an allocation based on its context. void identifyClones(); + bool assignFunctions(); + void dump() const; void print(raw_ostream &OS) const; @@ -369,6 +406,28 @@ return static_cast(this)->getLastStackId(Call); } + /// Update the allocation call to record the type of allocated memory. + void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { + AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; + static_cast(this)->updateAllocationCall(Call, AllocType); + } + + /// Update non-allocation call to invoke (possibly cloned) function + /// CalleeFunc. + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { + static_cast(this)->updateCall(CallerCall, CalleeFunc); + } + + /// Clone the given function for the given callsite, recording mapping of all + /// of the function's tracked calls to their new versions in the CallMap. + /// Assigns new clones to clone number CloneNo. 
+ FuncInfo cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + return static_cast(this)->cloneFunctionForCallsite( + Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); + } + /// Gets a label to use in the dot graph for the given call clone in the given /// function. std::string getLabel(const FuncTy *Func, const CallTy Call, @@ -461,7 +520,9 @@ : public CallsiteContextGraph { public: - ModuleCallsiteContextGraph(Module &M); + ModuleCallsiteContextGraph( + Module &M, + function_ref OREGetter); private: friend CallsiteContextGraph getStackIdsWithContextNodesForCall(Instruction *Call); + void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); + CallsiteContextGraph::FuncInfo + cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, + std::map &CallMap, + std::vector &CallsWithMetadataInFunc, + unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const; const Module &Mod; + function_ref OREGetter; }; /// Represents a call in the summary index graph, which can either be an @@ -517,6 +587,14 @@ bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); uint64_t getLastStackId(IndexCall &Call); std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); + CallsiteContextGraph::FuncInfo + cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, + std::map &CallMap, + std::vector &CallsWithMetadataInFunc, + unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, unsigned CloneNo) const; @@ -1231,7 +1309,9 @@ return StackIds; } -ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { +ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( + Module &M, 
function_ref OREGetter) + : Mod(M), OREGetter(OREGetter) { for (auto &F : M) { for (auto &BB : F) { for (auto &I : BB) { @@ -2063,6 +2143,874 @@ checkNode(Node, /*CheckEdges=*/true); } +static std::string getAllocTypeAttributeString(AllocationType Type) { + switch (Type) { + case AllocationType::NotCold: + return "notcold"; + break; + case AllocationType::Cold: + return "cold"; + break; + default: + dbgs() << "Unexpected alloc type " << (uint8_t)Type; + assert(false); + } + llvm_unreachable("invalid alloc type"); +} + +void ModuleCallsiteContextGraph::updateAllocationCall( + CallInfo &Call, AllocationType AllocType) { + std::string AllocTypeString = getAllocTypeAttributeString(AllocType); + auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), + "memprof", AllocTypeString); + cast(Call.call())->addFnAttr(A); + OREGetter(Call.call()->getFunction()) + .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) + << NV("AllocationCall", Call.call()) << " in clone " + << NV("Caller", Call.call()->getFunction()) + << " marked with memprof allocation attribute " + << NV("Attribute", AllocTypeString)); +} + +void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, + AllocationType AllocType) { + auto *AI = Call.call().dyn_cast(); + assert(AI); + assert(AI->Versions.size() > Call.cloneNo()); + AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; +} + +void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, + FuncInfo CalleeFunc) { + if (CalleeFunc.cloneNo() > 0) + cast(CallerCall.call())->setCalledFunction(CalleeFunc.func()); + OREGetter(CallerCall.call()->getFunction()) + .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) + << NV("Call", CallerCall.call()) << " in clone " + << NV("Caller", CallerCall.call()->getFunction()) + << " assigned to call function clone " + << NV("Callee", CalleeFunc.func())); +} + +void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, + FuncInfo CalleeFunc) { + auto 
*CI = CallerCall.call().dyn_cast(); + // Caller cannot be an allocation. + assert(CI); + assert(CI->Clones.size() > CallerCall.cloneNo()); + CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); +} + +CallsiteContextGraph::FuncInfo +ModuleCallsiteContextGraph::cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + // Use existing LLVM facilities for cloning and obtaining Call in clone + ValueToValueMapTy VMap; + auto *NewFunc = CloneFunction(Func.func(), VMap); + std::string Name = getPGHOFuncName(Func.func()->getName(), CloneNo); + assert(!Func.func()->getParent()->getFunction(Name)); + NewFunc->setName(Name); + for (auto &Inst : CallsWithMetadataInFunc) { + // This map always has the initial version in it. + assert(Inst.cloneNo() == 0); + CallMap[Inst] = {cast(VMap[Inst.call()]), CloneNo}; + } + OREGetter(Func.func()) + .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) + << "created clone " << NV("NewFunction", NewFunc)); + return {NewFunc, CloneNo}; +} + +CallsiteContextGraph::FuncInfo +IndexCallsiteContextGraph::cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + // Check how many clones we have of Call (and therefore function). + // The next clone number is the current size of versions array. + // Confirm this matches the CloneNo provided by the caller, which is based on + // the number of function clones we have. + assert(CloneNo == + (Call.call().is() + ? Call.call().dyn_cast()->Versions.size() + : Call.call().dyn_cast()->Clones.size())); + // Walk all the instructions in this function. Create a new version for + // each (by adding an entry to the Versions/Clones summary array), and copy + // over the version being called for the function clone being cloned here. 
+ // Additionally, add an entry to the CallMap for the new function clone, + // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) + // to the new call clone. + for (auto &Inst : CallsWithMetadataInFunc) { + // This map always has the initial version in it. + assert(Inst.cloneNo() == 0); + if (auto *AI = Inst.call().dyn_cast()) { + assert(AI->Versions.size() == CloneNo); + // We assign the allocation type later (in updateAllocationCall), just add + // an entry for it here. + AI->Versions.push_back(0); + } else { + auto *CI = Inst.call().dyn_cast(); + assert(CI && CI->Clones.size() == CloneNo); + // We assign the clone number later (in updateCall), just add an entry for + // it here. + CI->Clones.push_back(0); + } + CallMap[Inst] = {Inst.call(), CloneNo}; + } + return {Func.func(), CloneNo}; +} + +template +bool CallsiteContextGraph::assignFunctions() { + bool Changed = false; + + // Keep track of the assignment of nodes (callsites) to function clones they + // call. + std::map CallsiteToCalleeFuncCloneMap; + + // Update caller node to call function version CalleeFunc, by recording the + // assignment in CallsiteToCalleeFuncCloneMap. + auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, + const FuncInfo &CalleeFunc) { + CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; + }; + + // Walk all functions for which we saw calls with memprof metadata, and handle + // cloning for each of its calls. + for (auto &FuncEntry : FuncToCallsWithMetadata) { + FuncInfo OrigFunc(FuncEntry.first); + // Map from each clone of OrigFunc to a map of remappings of each call of + // interest (from original uncloned call to the corresponding cloned call in + // that function clone). 
+ std::map> FuncClonesToCallMap; + for (auto Call : FuncEntry.second) { + ContextNode *Node = getNodeForInst(Call); + // Skip call if we do not have a node for it (all uses of its stack ids + // were either on inlined chains or pruned from the MIBs), or if we did + // not create any clones for it. + if (!Node || Node->Clones.empty()) + continue; + // Not having a call should have prevented cloning. + assert(Node->hasCall()); + + // Track the assignment of function clones to clones of the current + // callsite Node being handled. + std::map FuncCloneToCurNodeCloneMap; + + // Assign callsite version CallsiteClone to function version FuncClone, + // and also assign (possibly cloned) Call to CallsiteClone. + auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, + CallInfo &Call, + ContextNode *CallsiteClone, + bool IsAlloc) { + // Record the clone of callsite node assigned to this function clone. + FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; + + assert(FuncClonesToCallMap.count(FuncClone)); + std::map &CallMap = FuncClonesToCallMap[FuncClone]; + CallInfo CallClone(Call); + if (CallMap.count(Call)) + CallClone = CallMap[Call]; + CallsiteClone->setCall(CallClone); + }; + + // Keep track of the clones of callsite Node that need to be assigned to + // function clones. This list may be expanded in the loop body below if we + // find additional cloning is required. + std::vector Clones(Node->Clones); + // Ignore original Node if we moved all of its contexts to clones. + if (!Node->ContextIds.empty()) + Clones.insert(Clones.begin(), Node); + + // Now walk through all of the clones of this callsite Node that we need, + // and determine the assignment to a corresponding clone of the current + // function (creating new function clones as needed). 
+ for (unsigned I = 0; I < Clones.size(); I++) { + ContextNode *Clone = Clones[I]; + if (VerifyNodes) + checkNode(Clone, /*CheckEdges=*/true); + + // Need to create a new function clone if we have more callsite clones + // than existing function clones, which would have been assigned to an + // earlier clone in the list (we assign callsite clones to function + // clones greedily). + if (FuncClonesToCallMap.size() <= I) { + // If this is the first callsite copy, assign to original function. + if (I == 0) { + // Since FuncClonesToCallMap is empty in this case, no clones have + // been created for this function yet, and no callers should have + // been assigned a function clone for this callee node yet. + assert(llvm::none_of(Clone->CallerEdges, [&](ContextEdge *E) { + return CallsiteToCalleeFuncCloneMap.count(E->Caller); + })); + // Initialize with empty call map, assign Clone to original function + // and its callers, and skip to the next clone. + FuncClonesToCallMap[OrigFunc] = {}; + AssignCallsiteCloneToFuncClone( + OrigFunc, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + for (auto CE : Clone->CallerEdges) + RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); + continue; + } + + // First locate which copy of OrigFunc to clone again. If a caller + // of this callsite clone was already assigned to call a particular + // function clone, we need to redirect all of those callers to the + // new function clone, and update their other callees within this + // function. 
+ FuncInfo PreviousAssignedFuncClone; + auto EI = llvm::find_if(Clone->CallerEdges, [&](ContextEdge *E) { + return CallsiteToCalleeFuncCloneMap.count(E->Caller); + }); + bool CallerAssignedToCloneOfFunc = false; + if (EI != Clone->CallerEdges.end()) { + ContextEdge *Edge = *EI; + PreviousAssignedFuncClone = + CallsiteToCalleeFuncCloneMap[Edge->Caller]; + CallerAssignedToCloneOfFunc = true; + } + + // Clone function and save it along with the CallInfo map created + // during cloning in the FuncClonesToCallMap. + std::map NewCallMap; + unsigned CloneNo = FuncClonesToCallMap.size(); + // Clone 0 is the original function, which should already exist in the + // map. + assert(CloneNo > 0); + FuncInfo NewFuncClone = cloneFunctionForCallsite( + OrigFunc, Call, NewCallMap, FuncEntry.second, CloneNo); + FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); + FunctionClonesAnalysis++; + Changed = true; + + // If no caller callsites were already assigned to a clone of this + // function, we can simply assign this clone to the new func clone + // and update all callers to it, then skip to the next clone. + if (!CallerAssignedToCloneOfFunc) { + AssignCallsiteCloneToFuncClone( + NewFuncClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + for (auto CE : Clone->CallerEdges) + RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); + continue; + } + + // We may need to do additional node cloning in this case. + // Reset the CallsiteToCalleeFuncCloneMap entry for any callers + // that were previously assigned to call PreviousAssignedFuncClone, + // to record that they now call NewFuncClone. + for (auto CE : Clone->CallerEdges) { + if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || + // We subsequently fall through to later handling that + // will perform any additional cloning required for + // callers that were calling other function clones. 
+ CallsiteToCalleeFuncCloneMap[CE->Caller] != + PreviousAssignedFuncClone) + continue; + + RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); + + // If we are cloning a function that was already assigned to some + // callers, then essentially we are creating new callsite clones + // of the other callsites in that function that are reached by those + // callers. Clone the other callees of the current callsite's caller + // that were already assigned to PreviousAssignedFuncClone + // accordingly. This is important since we subsequently update the + // calls from the nodes in the graph and their assignments to callee + // functions recorded in CallsiteToCalleeFuncCloneMap. + for (auto CalleeEdge : CE->Caller->CalleeEdges) { + ContextNode *Callee = CalleeEdge->Callee; + // Skip the current callsite, we are looking for other + // callsites Caller calls. + if (Callee == Clone) + continue; + if (!Callee->hasCall()) + continue; + // Skip any that have been removed on an earlier iteration when + // cleaning up newly None type callee edges. + if (CalleeEdge->Callee == nullptr && + CalleeEdge->Caller == nullptr) { + assert(RemovedEdges.count(CalleeEdge)); + continue; + } + ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); + removeNoneTypeCalleeEdges(NewClone); + // Moving the edge may have resulted in some none type + // callee edges on the original Callee. + removeNoneTypeCalleeEdges(Callee); + assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); + // If the Callee node was already assigned to call a specific + // function version, make sure its new clone is assigned to call + // that same function clone. + if (CallsiteToCalleeFuncCloneMap.count(Callee)) + RecordCalleeFuncOfCallsite( + NewClone, CallsiteToCalleeFuncCloneMap[Callee]); + // Update NewClone with the new Call clone of this callsite's Call + // created for the new function clone created earlier. 
+ // Recall that we have already ensured when building the graph + // that each caller can only call callsites within the same + // function, so we are guaranteed that Callee Call is in the + // current OrigFunc. + // CallMap is set up as indexed by original Call at clone 0. + CallInfo OrigCall(Callee->getOrigNode()->Call); + OrigCall.setCloneNo(0); + std::map &CallMap = + FuncClonesToCallMap[NewFuncClone]; + assert(CallMap.count(OrigCall)); + CallInfo NewCall(CallMap[OrigCall]); + assert(NewCall); + NewClone->setCall(NewCall); + } + } + // Fall through to handling below to perform the recording of the + // function for this callsite clone. This enables handling of cases + // where the callers were assigned to different clones of a function. + } + + // See if we can use existing function clone. Walk through + // all caller edges to see if any have already been assigned to + // a clone of this callsite's function. If we can use it, do so. If not, + // because that function clone is already assigned to a different clone + // of this callsite, then we need to clone again. + // Basically, this checking is needed to handle the case where different + // caller functions/callsites may need versions of this function + // containing different mixes of callsite clones across the different + // callsites within the function. If that happens, we need to create + // additional function clones to handle the various combinations. + // + // Keep track of any new clones of this callsite created by the + // following loop, as well as any existing clone that we decided to + // assign this clone to. + std::map FuncCloneToNewCallsiteCloneMap; + FuncInfo FuncCloneAssignedToCurCallsiteClone; + // We need to be able to remove Edge from CallerEdges, so need to adjust + // iterator in the loop. 
+ for (auto EI = Clone->CallerEdges.begin(); + EI != Clone->CallerEdges.end();) { + auto *Edge = *EI; + // If this caller already assigned to call a version of OrigFunc, need + // to ensure we can assign this callsite clone to that function clone. + if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { + FuncInfo FuncCloneCalledByCaller = + CallsiteToCalleeFuncCloneMap[Edge->Caller]; + // First we need to confirm that this function clone is available + // for use by this callsite node clone. + // + // While FuncCloneToCurNodeCloneMap is built only for this Node and + // its callsite clones, one of those callsite clones X could have + // been assigned to the same function clone called by Edge's caller + // - if Edge's caller calls another callsite within Node's original + // function, and that callsite has another caller reaching clone X. + // We need to clone Node again in this case. + if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && + FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != + Clone) || + // Detect when we have multiple callers of this callsite that + // have already been assigned to specific, and different, clones + // of OrigFunc (due to other unrelated callsites in Func they + // reach via call contexts). Is this Clone of callsite Node + // assigned to a different clone of OrigFunc? If so, clone Node + // again. + (FuncCloneAssignedToCurCallsiteClone && + FuncCloneAssignedToCurCallsiteClone != + FuncCloneCalledByCaller)) { + // We need to use a different newly created callsite clone, in + // order to assign it to another new function clone on a + // subsequent iteration over the Clones array (adjusted below). + // Note we specifically do not reset the + // CallsiteToCalleeFuncCloneMap entry for this caller, so that + // when this new clone is processed later we know which version of + // the function to copy (so that other callsite clones we have + // assigned to that function clone are properly cloned over). 
See + // comments in the function cloning handling earlier. + + // Check if we already have cloned this callsite again while + // walking through caller edges, for a caller calling the same + // function clone. If so, we can move this edge to that new clone + // rather than creating yet another new clone. + if (FuncCloneToNewCallsiteCloneMap.count( + FuncCloneCalledByCaller)) { + ContextNode *NewClone = + FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; + moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); + // Cleanup any none type edges cloned over. + removeNoneTypeCalleeEdges(NewClone); + } else { + // Create a new callsite clone. + ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); + removeNoneTypeCalleeEdges(NewClone); + FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = + NewClone; + // Add to list of clones and process later. + Clones.push_back(NewClone); + assert(EI == Clone->CallerEdges.end() || + Clone->AllocTypes != (uint8_t)AllocationType::None); + assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); + } + // Moving the caller edge may have resulted in some none type + // callee edges. + removeNoneTypeCalleeEdges(Clone); + // We will handle the newly created callsite clone in a subsequent + // iteration over this Node's Clones. Continue here since we + // already adjusted iterator EI while moving the edge. + continue; + } + + // Otherwise, we can use the function clone already assigned to this + // caller. + if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; + // Assign Clone to FuncCloneCalledByCaller + AssignCallsiteCloneToFuncClone( + FuncCloneCalledByCaller, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } else + // Don't need to do anything - callsite is already calling this + // function clone. 
+ assert(FuncCloneAssignedToCurCallsiteClone == + FuncCloneCalledByCaller); + + } else { + // We have not already assigned this caller to a version of + // OrigFunc. Do the assignment now. + + // First check if we have already assigned this callsite clone to a + // clone of OrigFunc for another caller during this iteration over + // its caller edges. + if (!FuncCloneAssignedToCurCallsiteClone) { + // Find first function in FuncClonesToCallMap without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncClonesToCallMap size was smaller than the clone number. + for (auto &CF : FuncClonesToCallMap) { + if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { + FuncCloneAssignedToCurCallsiteClone = CF.first; + break; + } + } + assert(FuncCloneAssignedToCurCallsiteClone); + // Assign Clone to FuncCloneAssignedToCurCallsiteClone + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } else + assert(FuncCloneToCurNodeCloneMap + [FuncCloneAssignedToCurCallsiteClone] == Clone); + // Update callers to record function version called. + RecordCalleeFuncOfCallsite(Edge->Caller, + FuncCloneAssignedToCurCallsiteClone); + } + + EI++; + } + } + if (VerifyCCG) { + checkNode(Node, /*CheckEdges=*/true); + for (auto *PE : Node->CalleeEdges) + checkNode(PE->Callee, + /*CheckEdges=*/true); + for (auto *CE : Node->CallerEdges) + checkNode(CE->Caller, + /*CheckEdges=*/true); + for (unsigned I = 0; I < Clones.size(); I++) { + ContextNode *Clone = Clones[I]; + checkNode(Clone, /*CheckEdges=*/true); + for (auto *PE : Clone->CalleeEdges) + checkNode(PE->Callee, + /*CheckEdges=*/true); + for (auto *CE : Clone->CallerEdges) + checkNode(CE->Caller, + /*CheckEdges=*/true); + } + } + } + } + + // Clean up edges removed during the assignment and additional cloning. 
+ deleteRemovedEdges(); + + auto UpdateCalls = [&](ContextNode *Node, + DenseSet &Visited, + auto &&UpdateCalls) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + + for (auto *Clone : Node->Clones) + UpdateCalls(Clone, Visited, UpdateCalls); + + for (auto &Edge : Node->CallerEdges) + UpdateCalls(Edge->Caller, Visited, UpdateCalls); + + // Skip if either no call to update, or if we ended up with no context ids + // (we moved all edges onto other clones). + if (!Node->hasCall() || Node->ContextIds.empty()) + return; + + if (Node->IsAllocation) { + updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + return; + } + + if (!CallsiteToCalleeFuncCloneMap.count(Node)) + return; + + auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; + updateCall(Node->Call, CalleeFunc); + }; + + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) + UpdateCalls(Entry.second, Visited, UpdateCalls); + + return Changed; +} + +bool PGHOContextDisambiguation::applyImport(Module &M) { + assert(ImportSummary); + bool Changed = false; + if (!ImportSummary->withPGHOContextDisambiguation()) { + // The profile matcher applies hotness attributes directly for allocations, + // and those will cause us to generate calls to the hot/cold interfaces + // unconditionally. If context disambiguation was not enabled in the thin + // link then assume we don't want these calls (e.g. not linking with + // the appropriate library, or otherwise trying to disable this behavior). + // For now, simply strip existing hotness attributes so they aren't applied, + // and exit early since no cloning decisions were made. 
+ for (auto &F : M) { + for (auto &BB : F) + for (auto &I : BB) { + auto *CI = dyn_cast(&I); + if (CI && CI->hasFnAttr("memprof")) { + CI->removeFnAttr("memprof"); + Changed = true; + } + } + } + return Changed; + } + + auto IsPGHOFunc = [](const Function &F) { + return F.getName().contains(".pgho."); + }; + + // We also need to clone any aliases that reference cloned functions, because + // the modified callsites may invoke via the alias. Keep track of the aliases + // for each function. + std::map> + FuncToAliasMap; + for (auto &A : M.aliases()) { + auto *Aliasee = A.getAliaseeObject(); + if (auto *F = dyn_cast(Aliasee)) + FuncToAliasMap[F].insert(&A); + } + + for (auto &F : M) { + if (F.isDeclaration() || IsPGHOFunc(F)) + continue; + + OptimizationRemarkEmitter ORE(&F); + + SmallVector, 4> VMaps; + bool ClonesCreated = false; + unsigned NumClonesCreated = 0; + auto CloneFuncIfNeeded = [&](unsigned NumClones) { + // We should at least have version 0 which is the original copy. + assert(NumClones > 0); + // If we already performed cloning of this function, confirm that the + // requested number of clones matches (the thin link should ensure the + // number of clones for each constituent callsite is consistent within + // each function), before returning. + if (ClonesCreated) { + assert(NumClonesCreated == NumClones); + return; + } + Changed = true; + ClonesCreated = true; + NumClonesCreated = NumClones; + // If only one copy needed use original. + if (NumClones == 1) + return; + VMaps.reserve(NumClones - 1); + FunctionsClonedThinBackend++; + for (unsigned I = 1; I < NumClones; I++) { + VMaps.emplace_back(new ValueToValueMapTy()); + auto *NewF = CloneFunction(&F, *VMaps.back()); + FunctionClonesThinBackend++; + // Strip memprof and callsite metadata from clone as they are no longer + // needed. 
+ for (auto &BB : *NewF) { + for (auto &Inst : BB) { + Inst.setMetadata(LLVMContext::MD_memprof, nullptr); + Inst.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + std::string Name = getPGHOFuncName(F.getName(), I); + auto *PrevF = M.getFunction(Name); + if (PrevF) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + assert(PrevF->isDeclaration()); + NewF->takeName(PrevF); + PrevF->replaceAllUsesWith(NewF); + PrevF->eraseFromParent(); + } else + NewF->setName(Name); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) + << "created clone " << NV("NewFunction", NewF)); + + // Now handle aliases to this function, and clone those as well. + if (!FuncToAliasMap.count(&F)) + continue; + for (auto *A : FuncToAliasMap[&F]) { + std::string Name = getPGHOFuncName(A->getName(), I); + auto *PrevA = M.getNamedAlias(Name); + auto *NewA = GlobalAlias::create( + A->getValueType(), A->getType()->getPointerAddressSpace(), + A->getLinkage(), Name, NewF); + NewA->copyAttributesFrom(A); + if (PrevA) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + assert(PrevA->isDeclaration()); + NewA->takeName(PrevA); + PrevA->replaceAllUsesWith(NewA); + PrevA->eraseFromParent(); + } + } + } + }; + + // Locate the summary for F. This is complicated by the fact that it might + // have been internalized or promoted. + // FIXME: Ideally we would retain the original GUID in some fashion on the + // function (e.g. as metadata), but for now do our best to locate the + // summary without that information. + ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); + if (!TheFnVI) + // See if theFn was internalized, by checking index directly with + // original name (this avoids the name adjustment done by getGUID() for + // internal symbols). 
+ TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); + if (!TheFnVI) { + // Now query with the original name before any promotion was performed. + StringRef OrigName = + ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); + std::string OrigId = GlobalValue::getGlobalIdentifier( + OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); + TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); + // Could be a promoted local imported from another module. We need to pass + // down more info here to find the original module id. For now, try with + // the OrigName which might have been stored in the OidGuidMap in the + // index. This would not work if there were same-named locals in multiple + // modules, however. + if (!TheFnVI) { + auto OrigGUID = ImportSummary->getGUIDFromOriginalID( + GlobalValue::getGUID(OrigName)); + if (OrigGUID) + TheFnVI = ImportSummary->getValueInfo(OrigGUID); + } + } + // If still not found, this could be an imported local (see comment above). + // Skip for now as it will be cloned in its original module (where it would + // have been promoted to global scope so should satisfy any reference in + // this module). + if (!TheFnVI) + continue; + + auto *GVSummary = + ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); + if (!GVSummary) + // Must have been imported, use the first summary (might be multiple if + // this was a linkonce_odr). + GVSummary = TheFnVI.getSummaryList().front().get(); + + // If this was an imported alias skip it as we won't have the function + // summary, and it should be cloned in the original module. 
+ if (isa(GVSummary)) + continue; + + auto *FS = cast(GVSummary->getBaseObject()); + + if (FS->allocs().empty() && FS->callsites().empty()) + continue; + + auto SI = FS->callsites().begin(); + auto AI = FS->allocs().begin(); + + // Assume for now that the instructions are in the exact same order + // as when the summary was created, but confirm this is correct by + // matching the stack ids. + for (auto &BB : F) { + for (auto &I : BB) { + auto *CB = dyn_cast(&I); + // Same handling as when creating module summary. + if (!mayHaveMemprofSummary(CB)) + continue; + + CallStack CallsiteContext( + I.getMetadata(LLVMContext::MD_callsite)); + auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); + + // Include allocs that were already assigned a memprof function + // attribute in the statistics. + if (CB->getAttributes().hasFnAttr("memprof")) { + assert(!MemProfMD); + CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" + ? AllocTypeColdThinBackend++ + : AllocTypeNotColdThinBackend++; + OrigAllocsThinBackend++; + AllocVersionsThinBackend++; + if (!MaxAllocVersionsThinBackend) + MaxAllocVersionsThinBackend = 1; + } + + if (MemProfMD) { + // Consult the next alloc node. + assert(AI != FS->allocs().end()); + auto &AllocNode = *(AI++); + + // Sanity check that the MIB stack ids match between the summary and + // instruction metadata. 
+          // Walk the summary MIBs in lockstep with the !memprof metadata
+          // operands; both are expected to be in the same order.
+          auto MIBIter = AllocNode.MIBs.begin();
+          for (auto &MDOp : MemProfMD->operands()) {
+            assert(MIBIter != AllocNode.MIBs.end());
+            auto &MIB = *(MIBIter++);
+            auto StackIdIndexIter = MIB.StackIdIndices.begin();
+            auto *MIBMD = cast<MDNode>(MDOp);
+            MDNode *StackMDNode = getMIBStackNode(MIBMD);
+            assert(StackMDNode);
+            // Stack ids accepted so far for this MIB, used to detect adjacent
+            // duplicates produced by direct recursion.
+            SmallVector<uint64_t> StackIdsFromMetadata;
+            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
+            for (auto ContextIter =
+                     StackContext.beginAfterSharedPrefix(CallsiteContext);
+                 ContextIter != StackContext.end(); ++ContextIter) {
+              // If this is a direct recursion, simply skip the duplicate
+              // entries, to be consistent with how the summary ids were
+              // generated during ModuleSummaryAnalysis.
+              if (!StackIdsFromMetadata.empty() &&
+                  StackIdsFromMetadata.back() == *ContextIter)
+                continue;
+              // Remember this id so that the duplicate check above can
+              // recognize an immediately repeated frame.
+              StackIdsFromMetadata.push_back(*ContextIter);
+              // Bounds check against the MIB being verified. MIBIter was
+              // already advanced past it above, so it must not be used here.
+              assert(StackIdIndexIter != MIB.StackIdIndices.end());
+              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
+                     *ContextIter);
+              StackIdIndexIter++;
+            }
+          }
+
+          // Perform cloning if not yet done.
+          CloneFuncIfNeeded(AllocNode.Versions.size());
+
+          OrigAllocsThinBackend++;
+          AllocVersionsThinBackend += AllocNode.Versions.size();
+          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
+            MaxAllocVersionsThinBackend = AllocNode.Versions.size();
+
+          // If there is only one version that means we didn't end up
+          // considering this function for cloning, and in that case the alloc
+          // will still be none type or should have gotten the default NotCold.
+          // Skip that after calling clone helper since that does some sanity
+          // checks that confirm we haven't decided yet that we need cloning.
+          if (AllocNode.Versions.size() == 1) {
+            assert((AllocationType)AllocNode.Versions[0] ==
+                       AllocationType::NotCold ||
+                   (AllocationType)AllocNode.Versions[0] ==
+                       AllocationType::None);
+            UnclonableAllocsThinBackend++;
+            continue;
+          }
+
+          // All versions should have a singular allocation type.
+ assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { + return Type == ((uint8_t)AllocationType::NotCold | + (uint8_t)AllocationType::Cold); + })); + + // Update the allocation types per the summary info. + for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { + // Ignore any that didn't get an assigned allocation type. + if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) + continue; + AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; + AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ + : AllocTypeNotColdThinBackend++; + std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); + auto A = llvm::Attribute::get(F.getContext(), "memprof", + AllocTypeString); + CallBase *CBClone; + // Copy 0 is the original function. + if (!J) + CBClone = CB; + else + CBClone = cast((*VMaps[J - 1])[CB]); + CBClone->addFnAttr(A); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) + << NV("AllocationCall", CBClone) << " in clone " + << NV("Caller", CBClone->getFunction()) + << " marked with memprof allocation attribute " + << NV("Attribute", AllocTypeString)); + } + } else if (!CallsiteContext.empty()) { + // Consult the next callsite node. + assert(SI != FS->callsites().end()); + auto &StackNode = *(SI++); + + // Sanity check that the stack ids match between the summary and + // instruction metadata. + auto StackIdIndexIter = StackNode.StackIdIndices.begin(); + for (auto StackId : CallsiteContext) { + assert(StackIdIndexIter != StackNode.StackIdIndices.end()); + assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == + StackId); + StackIdIndexIter++; + } + + // Perform cloning if not yet done. + CloneFuncIfNeeded(StackNode.Clones.size()); + + // Should have skipped indirect calls via mayHaveMemprofSummary. + assert(CB->getCalledFunction()); + assert(!IsPGHOFunc(*CB->getCalledFunction())); + + // Update the calls per the summary info. 
+ // Save orig name since it gets updated in the first iteration + // below. + auto CalleeOrigName = CB->getCalledFunction()->getName(); + for (unsigned J = 0; J < StackNode.Clones.size(); J++) { + // Do nothing if this version calls the original version of its + // callee. + if (!StackNode.Clones[J]) + continue; + auto NewF = M.getOrInsertFunction( + getPGHOFuncName(CalleeOrigName, StackNode.Clones[J]), + CB->getCalledFunction()->getFunctionType()); + CallBase *CBClone; + // Copy 0 is the original function. + if (!J) + CBClone = CB; + else + CBClone = cast((*VMaps[J - 1])[CB]); + CBClone->setCalledFunction(NewF); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) + << NV("Call", CBClone) << " in clone " + << NV("Caller", CBClone->getFunction()) + << " assigned to call function clone " + << NV("Callee", NewF.getCallee())); + } + } + // Memprof and callsite metadata on memory allocations no longer needed. + I.setMetadata(LLVMContext::MD_memprof, nullptr); + I.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + } + + return Changed; +} + template bool CallsiteContextGraph::process() { if (DumpCCG) { @@ -2089,22 +3037,62 @@ if (ExportToDot) exportToDot("ccg.cloned.dot"); - return false; + bool Changed = assignFunctions(); + + if (DumpCCG) { + dbgs() << "CCG after assigning function clones:\n"; + dbgs() << *this; + } + if (ExportToDot) + exportToDot("ccg.clonefuncassign.dot"); + + return Changed; } bool PGHOContextDisambiguation::processModule( - Module &M) { + Module &M, + function_ref OREGetter) { bool Changed = false; - ModuleCallsiteContextGraph CCG(M); + // If we have an import summary, then the cloning decisions were made during + // the thin link on the index. Apply them and return. 
+ if (ImportSummary) { + Changed = applyImport(M); + return Changed; + } + + ModuleCallsiteContextGraph CCG(M, OREGetter); Changed = CCG.process(); return Changed; } +PGHOContextDisambiguation::PGHOContextDisambiguation( + const ModuleSummaryIndex *Summary) + : ImportSummary(Summary) { + // The PGHOImportSummary should only be used for testing ThinLTO distributed + // backend handling via opt, in which case we don't have a summary from the + // pass pipeline. + assert(!ImportSummary || PGHOImportSummary.empty()); + if (!ImportSummary && !PGHOImportSummary.empty()) { + ExitOnError ExitOnErr("-pgho-import-summary: " + PGHOImportSummary + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(PGHOImportSummary))); + if (Expected> SummaryOrErr = + getModuleSummaryIndex(*ReadSummaryFile)) { + ImportSummaryForTesting = std::move(*SummaryOrErr); + ImportSummary = ImportSummaryForTesting.get(); + } + } +} + PreservedAnalyses PGHOContextDisambiguation::run(Module &M, ModuleAnalysisManager &AM) { - if (!processModule(M)) + auto &FAM = AM.getResult(M).getManager(); + auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { + return FAM.getResult(*F); + }; + if (!processModule(M, OREGetter)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -2115,4 +3103,5 @@ isPrevailing) { IndexCallsiteContextGraph CCG(Index, isPrevailing); CCG.process(); + Index.setWithPGHOContextDisambiguation(); } diff --git a/llvm/test/ThinLTO/X86/pgho-basic.ll b/llvm/test/ThinLTO/X86/pgho-basic.ll --- a/llvm/test/ThinLTO/X86/pgho-basic.ll +++ b/llvm/test/ThinLTO/X86/pgho-basic.ll @@ -1,5 +1,5 @@ ;; Test callsite context graph generation for simple call graph with -;; two memprof contexts and no inlining. +;; two memprof contexts and no inlining, as well as graph and IR cloning. 
;; ;; Original code looks like: ;; @@ -37,12 +37,44 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should have cloned bar, baz, and foo, for the cold memory allocation. 
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Check distributed index +; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + ; ModuleID = 'pgho-basic.ll' source_filename = "pgho-basic.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -150,6 +182,7 @@ !15 = !{i64 -5964873800580613432} !16 = !{i64 2732490490862098848} + ; DUMP: CCG before cloning: ; DUMP: Callsite Context Graph: ; DUMP: Node [[BAR:0x[a-z0-9]+]] @@ -275,6 +308,52 @@ ; DUMP: CallerEdges: +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.pgho.1 +; REMARKS: call in clone _Z3bazv.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3bazv.pgho.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: %call = call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.pgho.1() +; IR: %call = call {{.*}} @_Z3barv.pgho.1() +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Z3bazv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAR:0x[a-z0-9]+]] 
[shape="record",label="OrigId: Alloc0\n_Z3barv -\> alloc",tooltip="N[[BAR]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv",tooltip="N[[BAZ]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold @@ -306,3 +385,9 @@ ; DOTCLONED: N[[FOO]] -> N[[BAZ]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default ; DOTCLONED: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default ; DOTCLONED: } + + +; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6988045695824228603, {{.*}} callsites: ((callee: ^[[BAZ:[0-9]+]], clones: (0, 1) +; DISTRIB: ^[[BAR:[0-9]+]] = gv: (guid: 10756268697391741933, {{.*}} allocs: ((versions: (notcold, cold) +; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1) +; DISTRIB: ^[[BAZ]] = gv: (guid: 17547784407117670007, {{.*}} callsites: ((callee: ^[[BAR]], clones: (0, 1) diff --git a/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll --- a/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll +++ b/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll @@ -1,7 +1,8 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple inlined ;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. +;; while matching callsite nodes onto the graph. Also tests graph and IR +;; cloning. ;; ;; Original code looks like: ;; @@ -58,13 +59,46 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. 
\ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST ;; We should clone D once for the cold allocations via C. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST +;; We should clone D once for the cold allocations via C. 
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Check distributed index +; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + ; ModuleID = 'duplicate-context-ids.ll' source_filename = "duplicate-context-ids.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -347,6 +381,49 @@ ; DUMP: CallerEdges: +; REMARKS: created clone _Z1Dv.pgho.1 +; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1Dv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.pgho.1 +; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.pgho.1 + + +;; The allocation via F does not allocate cold memory. It should call the +;; original D, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: define internal {{.*}} @_Z1Dv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z1Fv() +; IR: %call = call {{.*}} @_Z1Dv() +;; The allocations via B and E allocate cold memory. They should call the +;; cloned D, which ultimately call the cloned allocation decorated with a +;; "cold" attribute. 
+; IR: define internal {{.*}} @_Z1Bv() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define internal {{.*}} @_Z1Ev() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define dso_local {{.*}} @main +; IR: %call = call {{.*}} @_Z1Bv() +; IR: %call1 = call {{.*}} @_Z1Ev() +; IR: %call2 = call {{.*}} @_Z1Fv() +; IR: define internal {{.*}} @_Z1Dv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 1 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOTPRE: digraph CallsiteContextGraph { ; DOTPRE: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> alloc",tooltip="N[[D]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOTPRE: 
N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\nnull call (external)",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default @@ -384,3 +461,9 @@ ; DOTCLONED: N[[B]] -> N[[D2]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold ; DOTCLONED: N[[F]] -> N[[D]][tooltip=" ContextIds: 2",fillcolor="brown1"]; // default ; DOTCLONED: } + +; DISTRIB: ^[[D:[0-9]+]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold) +; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1) +; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[B]], clones: (0), {{.*}} (callee: ^[[E:[0-9]+]], clones: (0), {{.*}} (callee: ^[[F:[0-9]+]], clones: (0) +; DISTRIB: ^[[F]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0) +; DISTRIB: ^[[E]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1) diff --git a/llvm/test/ThinLTO/X86/pgho-funcassigncloning.ll b/llvm/test/ThinLTO/X86/pgho-funcassigncloning.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-funcassigncloning.ll @@ -0,0 +1,418 @@ +;; Test context disambiguation for a callgraph containing multiple memprof +;; contexts and no inlining, where we need to perform additional cloning +;; during function assignment/cloning to handle the combination of contexts +;; to 2 different allocations. 
+;; +;; void E(char **buf1, char **buf2) { +;; *buf1 = new char[10]; +;; *buf2 = new char[10]; +;; } +;; +;; void B(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void C(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void D(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; int main(int argc, char **argv) { +;; char *cold1, *cold2, *default1, *default2, *default3, *default4; +;; B(&default1, &default2); +;; C(&default3, &cold1); +;; D(&cold2, &default4); +;; memset(cold1, 0, 10); +;; memset(cold2, 0, 10); +;; memset(default1, 0, 10); +;; memset(default2, 0, 10); +;; memset(default3, 0, 10); +;; memset(default4, 0, 10); +;; delete[] default1; +;; delete[] default2; +;; delete[] default3; +;; delete[] default4; +;; sleep(10); +;; delete[] cold1; +;; delete[] cold2; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. + + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +;; Run 
ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + +; ModuleID = 'funcassigncloning.ll' +source_filename = "funcassigncloning.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1EPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !14 + %0 = load ptr, ptr %buf1.addr, align 8 + store ptr %call, ptr %0, align 8 + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !15, !callsite !22 + %1 = load ptr, ptr %buf2.addr, align 8 + store ptr %call1, ptr %1, align 8 + ret void +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1BPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !23 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1CPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr 
%buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !24 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1DPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !25 + ret void +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %cold1 = alloca ptr, align 8 + %cold2 = alloca ptr, align 8 + %default1 = alloca ptr, align 8 + %default2 = alloca ptr, align 8 + %default3 = alloca ptr, align 8 + %default4 = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + call void @_Z1BPPcS0_(ptr noundef %default1, ptr noundef %default2), !callsite !26 + call void @_Z1CPPcS0_(ptr noundef %default3, ptr noundef %cold1), !callsite !27 + call void @_Z1DPPcS0_(ptr noundef %cold2, ptr noundef %default4), !callsite !28 + %0 = load ptr, ptr %cold1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %cold2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %default1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %default2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %default3, align 8 + call void 
@llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %default4, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %default1, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %default2, align 8 + %isnull1 = icmp eq ptr %7, null + br i1 %isnull1, label %delete.end3, label %delete.notnull2 + +delete.notnull2: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #7 + br label %delete.end3 + +delete.end3: ; preds = %delete.notnull2, %delete.end + %8 = load ptr, ptr %default3, align 8 + %isnull4 = icmp eq ptr %8, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end3 + call void @_ZdaPv(ptr noundef %8) #7 + br label %delete.end6 + +delete.end6: ; preds = %delete.notnull5, %delete.end3 + %9 = load ptr, ptr %default4, align 8 + %isnull7 = icmp eq ptr %9, null + br i1 %isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %9) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + %call = call i32 @sleep(i32 noundef 10) + %10 = load ptr, ptr %cold1, align 8 + %isnull10 = icmp eq ptr %10, null + br i1 %isnull10, label %delete.end12, label %delete.notnull11 + +delete.notnull11: ; preds = %delete.end9 + call void @_ZdaPv(ptr noundef %10) #7 + br label %delete.end12 + +delete.end12: ; preds = %delete.notnull11, %delete.end9 + %11 = load ptr, ptr %cold2, align 8 + %isnull13 = icmp eq ptr %11, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end12 + call void @_ZdaPv(ptr noundef %11) #7 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end12 + 
ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, 
!"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10, !12} +!8 = !{!9, !"cold"} +!9 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!10 = !{!11, !"notcold"} +!11 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!12 = !{!13, !"notcold"} +!13 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!14 = !{i64 -3461278137325233666} +!15 = !{!16, !18, !20} +!16 = !{!17, !"notcold"} +!17 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!18 = !{!19, !"cold"} +!19 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!20 = !{!21, !"notcold"} +!21 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!22 = !{i64 -1415475215210681400} +!23 = !{i64 -2441057035866683071} +!24 = !{i64 -3483158674395044949} +!25 = !{i64 -7799663586031895603} +!26 = !{i64 4256801922104815624} +!27 = !{i64 6438520854747849124} +!28 = !{i64 -8402480891374135967} + + +;; Originally we create a single clone of each call to new from E, since each +;; allocates cold memory for a single caller. 
+ +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: AllocType 1 StackIds: 2 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] + +; DUMP: Node [[C]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW1CLONE]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: AllocType 1 StackIds: 2 +; DUMP: (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Clone of [[ENEW1ORIG]] + +; DUMP: Node [[D]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 6 1 +; DUMP: CalleeEdges: +; DUMP: Edge from 
Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW2ORIG]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Clones: [[ENEW2CLONE]] + +; DUMP: Node [[ENEW2CLONE]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: Clone of [[ENEW2ORIG]] + + +;; We greedily create a clone of E that is initially used by the clones of the +;; first call to new. However, we end up with an incompatible set of callers +;; given the second call to new which has clones with a different combination of +;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. 
+; REMARKS: created clone _Z1EPPcS0_.pgho.1 +; REMARKS: created clone _Z1EPPcS0_.pgho.2 +; REMARKS: created clone _Z1EPPcS0_.pgho.3 +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.3 +; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.2 + + +;; Original version of E is used for the non-cold allocations, both from B. +; IR: define internal {{.*}} @_Z1EPPcS0_( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1BPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_( +;; C calls a clone of E with the first new allocating non-cold memory and the +;; second allocating cold memory. +; IR: define internal {{.*}} @_Z1CPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.pgho.3( +;; D calls a clone of E with the first new allocating cold memory and the +;; second allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.pgho.2( +; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.2( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.3( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 2 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 2 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 4 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 4 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 8 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 4 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/pgho-indirectcall.ll b/llvm/test/ThinLTO/X86/pgho-indirectcall.ll --- a/llvm/test/ThinLTO/X86/pgho-indirectcall.ll +++ b/llvm/test/ThinLTO/X86/pgho-indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation 
for call graph containing indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. +;; cloning. Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -59,13 +59,45 @@ ; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call ;; from main allocating cold memory. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should only create a single clone of foo, for the direct call +;; from main allocating cold memory.
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + ; ModuleID = 'indirectcall.ll' source_filename = "indirectcall.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -539,6 +571,47 @@ ; DUMP: CallerEdges: +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3foov.pgho.1 marked with memprof allocation attribute cold + + +; IR: define internal {{.*}} @_Z3barP1A( +; IR: %call = call {{.*}} %1( +; IR: define {{.*}} @main( +; IR: %call = call {{.*}} @_Z3foov() +;; Only the second call to foo, which allocates cold memory via direct calls, +;; is replaced with a call to a clone that calls a cold allocation.
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: %call2 = call {{.*}} @_Z3barP1A( +; IR: %call3 = call {{.*}} @_Z3barP1A( +; IR: %call4 = call {{.*}} @_Z3barP1A( +; IR: %call5 = call {{.*}} @_Z3barP1A( +; IR: define internal {{.*}} @_ZN1A1xEv( +; IR: %call = call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_ZN1B1xEv( +; IR: %call = call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 1 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 
Alloc0\n_Z3foov -\> alloc",tooltip="N[[FOO]] ContextIds: 2 4 6 1 3 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold diff --git a/llvm/test/ThinLTO/X86/pgho-inlined.ll b/llvm/test/ThinLTO/X86/pgho-inlined.ll --- a/llvm/test/ThinLTO/X86/pgho-inlined.ll +++ b/llvm/test/ThinLTO/X86/pgho-inlined.ll @@ -1,6 +1,7 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto the graph. +;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -46,13 +47,45 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE \ +; RUN: --check-prefix=STATS-INPROCESS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate ;; cold memory. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should create clones for foo and bar for the call from main to allocate +;; cold memory. +; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=STATS-DISTRIB-BE \ +; RUN: --check-prefix=REMARKS + ; ModuleID = 'inlined.ll' source_filename = "inlined.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -311,6 +344,52 @@ ; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 + + +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64
noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov() +; IR: %call.i = call {{.*}} @_Z3barv() +; IR: define dso_local {{.*}} @main(i32 noundef %argc, ptr noundef %argv) +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call.i = call {{.*}} @_Z3barv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 2 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-INPROCESS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +;; The distributed backend hasn't yet eliminated the now-dead baz with +;; the allocation from bar inlined, so it has one more allocation. 
+; STATS-DISTRIB-BE: 3 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 2 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 2 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-INPROCESS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend +;; The distributed backend hasn't yet eliminated the now-dead baz with +;; the allocation from bar inlined, so it has one more allocation. +; STATS-DISTRIB-BE: 2 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3bazv -\> alloc",tooltip="N[[BAR]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\nnull call (external)",tooltip="N[[FOO]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll b/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll --- a/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll @@ -1,5 +1,5 @@ ;; Test callsite context graph generation for simple call graph with -;; two memprof contexts and no inlining. +;; two memprof contexts and no inlining, as well as graph and IR cloning. 
;; ;; Original code looks like: ;; @@ -32,7 +32,9 @@ ; RUN: opt -passes=pgho-context-disambiguation \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -261,6 +263,48 @@ ; DUMP: CallerEdges: +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: created clone _Z3bazv.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3bazv.pgho.1 +; REMARKS: call in clone _Z3bazv.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: %call = call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.pgho.1() +; IR: %call = call {{.*}} @_Z3barv.pgho.1() +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Z3bazv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3barv -\> _Znam",tooltip="N[[BAR]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv",tooltip="N[[BAZ]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll --- a/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll +++ b/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll @@ -1,7 +1,8 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple 
inlined ;; callsite contexts, requiring duplication of context ids and nodes -;; while matching callsite nodes onto the graph. +;; while matching callsite nodes onto the graph. Also tests graph and IR +;; cloning. ;; ;; Original code looks like: ;; @@ -53,7 +54,9 @@ ; RUN: opt -passes=pgho-context-disambiguation \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST @@ -330,6 +333,44 @@ ; DUMP: CallerEdges: +; REMARKS: created clone _Z1Dv.pgho.1 +; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.pgho.1 +; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.pgho.1 +; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.pgho.1 +; REMARKS: call in clone _Z1Dv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv +; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold + + +;; The allocation via F does not allocate cold memory. It should call the +;; original D, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: define internal {{.*}} @_Z1Dv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z1Fv() +; IR: %call = call {{.*}} @_Z1Dv() +;; The allocations via B and E allocate cold memory. They should call the +;; cloned D, which ultimately call the cloned allocation decorated with a +;; "cold" attribute. 
+; IR: define internal {{.*}} @_Z1Bv() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define internal {{.*}} @_Z1Ev() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define dso_local {{.*}} @main +; IR: %call = call {{.*}} @_Z1Bv() +; IR: %call1 = call {{.*}} @_Z1Ev() +; IR: %call2 = call {{.*}} @_Z1Fv() +; IR: define internal {{.*}} @_Z1Dv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis + + ; DOTPRE: digraph CallsiteContextGraph { ; DOTPRE: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> _Znam",tooltip="N[[D]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOTPRE: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\nnull call (external)",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/PGHOContextDisambiguation/funcassigncloning.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/funcassigncloning.ll @@ -0,0 +1,374 @@ +;; Test context disambiguation for a callgraph containing multiple memprof +;; contexts and no inlining, where we need to perform additional cloning +;; during function assignment/cloning to handle the combination of contexts +;; to 2 different allocations. 
+;; +;; void E(char **buf1, char **buf2) { +;; *buf1 = new char[10]; +;; *buf2 = new char[10]; +;; } +;; +;; void B(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void C(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void D(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; int main(int argc, char **argv) { +;; char *cold1, *cold2, *default1, *default2, *default3, *default4; +;; B(&default1, &default2); +;; C(&default3, &cold1); +;; D(&cold2, &default4); +;; memset(cold1, 0, 10); +;; memset(cold2, 0, 10); +;; memset(default1, 0, 10); +;; memset(default2, 0, 10); +;; memset(default3, 0, 10); +;; memset(default4, 0, 10); +;; delete[] default1; +;; delete[] default2; +;; delete[] default3; +;; delete[] default4; +;; sleep(10); +;; delete[] cold1; +;; delete[] cold2; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. + +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +; ModuleID = 'funcassigncloning.ll' +source_filename = "funcassigncloning.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1EPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !14 + %0 = load ptr, ptr %buf1.addr, align 8 + store ptr %call, ptr %0, align 8 + %call1 = call noalias noundef 
nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !15, !callsite !22 + %1 = load ptr, ptr %buf2.addr, align 8 + store ptr %call1, ptr %1, align 8 + ret void +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1BPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !23 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1CPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !24 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1DPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !25 + ret void +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %cold1 = alloca ptr, align 8 + %cold2 = alloca ptr, align 8 + %default1 = alloca 
ptr, align 8 + %default2 = alloca ptr, align 8 + %default3 = alloca ptr, align 8 + %default4 = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + call void @_Z1BPPcS0_(ptr noundef %default1, ptr noundef %default2), !callsite !26 + call void @_Z1CPPcS0_(ptr noundef %default3, ptr noundef %cold1), !callsite !27 + call void @_Z1DPPcS0_(ptr noundef %cold2, ptr noundef %default4), !callsite !28 + %0 = load ptr, ptr %cold1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %cold2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %default1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %default2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %default3, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %default4, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %default1, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %default2, align 8 + %isnull1 = icmp eq ptr %7, null + br i1 %isnull1, label %delete.end3, label %delete.notnull2 + +delete.notnull2: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #7 + br label %delete.end3 + +delete.end3: ; preds = %delete.notnull2, %delete.end + %8 = load ptr, ptr %default3, align 8 + %isnull4 = icmp eq ptr %8, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end3 + call void @_ZdaPv(ptr noundef %8) #7 + br label %delete.end6 + +delete.end6: ; preds = 
%delete.notnull5, %delete.end3 + %9 = load ptr, ptr %default4, align 8 + %isnull7 = icmp eq ptr %9, null + br i1 %isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %9) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + %call = call i32 @sleep(i32 noundef 10) + %10 = load ptr, ptr %cold1, align 8 + %isnull10 = icmp eq ptr %10, null + br i1 %isnull10, label %delete.end12, label %delete.notnull11 + +delete.notnull11: ; preds = %delete.end9 + call void @_ZdaPv(ptr noundef %10) #7 + br label %delete.end12 + +delete.end12: ; preds = %delete.notnull11, %delete.end9 + %11 = load ptr, ptr %cold2, align 8 + %isnull13 = icmp eq ptr %11, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end12 + call void @_ZdaPv(ptr noundef %11) #7 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end12 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10, !12} +!8 = !{!9, !"cold"} +!9 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!10 = !{!11, !"notcold"} +!11 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!12 = !{!13, !"notcold"} +!13 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!14 = !{i64 -3461278137325233666} +!15 = !{!16, !18, !20} +!16 = !{!17, !"notcold"} +!17 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!18 = !{!19, !"cold"} +!19 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!20 = !{!21, !"notcold"} +!21 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!22 = !{i64 -1415475215210681400} +!23 = !{i64 -2441057035866683071} +!24 = !{i64 -3483158674395044949} +!25 = !{i64 -7799663586031895603} +!26 = !{i64 4256801922104815624} +!27 = !{i64 6438520854747849124} +!28 = !{i64 -8402480891374135967} + + +;; Originally we create a single clone of each call to new from E, 
since each +;; allocates cold memory for a single caller. + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] + +; DUMP: Node [[C]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW1CLONE]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Clone of [[ENEW1ORIG]] + +; DUMP: Node [[D]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 6 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from 
Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW2ORIG]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Clones: [[ENEW2CLONE]] + +; DUMP: Node [[ENEW2CLONE]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: Clone of [[ENEW2ORIG]] + + +;; We greedily create a clone of E that is initially used by the clones of the +;; first call to new. However, we end up with an incompatible set of callers +;; given the second call to new which has clones with a different combination of +;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. 
+; REMARKS: created clone _Z1EPPcS0_.pgho.1 +; REMARKS: created clone _Z1EPPcS0_.pgho.2 +; REMARKS: created clone _Z1EPPcS0_.pgho.3 +; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.2 +; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.3 +; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_ +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold + + +;; Original version of E is used for the non-cold allocations, both from B. +; IR: define internal {{.*}} @_Z1EPPcS0_( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1BPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_( +;; C calls a clone of E with the first new allocating non-cold memory and the +;; second allocating cold memory. +; IR: define internal {{.*}} @_Z1CPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.pgho.3( +;; D calls a clone of E with the first new allocating cold memory and the +;; second allocating non-cold memory. +; IR: define internal {{.*}} @_Z1DPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.pgho.2( +;; Transient clone that will get removed as it ends up with no callers. +;; Its calls to new never get updated with a memprof attribute as a result.
+; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.1( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]] +; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.2( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.3( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[DEFAULT]] = { builtin allocsize(0) } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 2 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 4 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll --- a/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll +++ b/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation for call graph containing indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. +;; cloning. Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -52,7 +52,9 @@ ; RUN: opt -passes=pgho-context-disambiguation \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. 
\ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call @@ -511,6 +513,43 @@ ; DUMP: CallerEdges: +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 marked with memprof allocation attribute cold +; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov +; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold + + +; IR: define internal {{.*}} @_Z3barP1A( +; IR: %call = call {{.*}} %1( +; IR: define {{.*}} @main( +; IR: %call = call {{.*}} @_Z3foov() +;; Only the second call to foo, which allocates cold memory via direct calls, +;; is replaced with a call to a clone that calls a cold allocation. 
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: %call2 = call {{.*}} @_Z3barP1A( +; IR: %call3 = call {{.*}} @_Z3barP1A( +; IR: %call4 = call {{.*}} @_Z3barP1A( +; IR: %call5 = call {{.*}} @_Z3barP1A( +; IR: define internal {{.*}} @_ZN1A1xEv( +; IR: %call = call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_ZN1B1xEv( +; IR: %call = call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3foov -\> _Znam",tooltip="N[[FOO]] ContextIds: 2 4 6 1 3 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll b/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll --- a/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll +++ b/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll @@ -1,6 +1,7 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto 
the graph. +;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -41,7 +42,9 @@ ; RUN: opt -passes=pgho-context-disambiguation \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate @@ -291,6 +294,42 @@ ; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4 +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold + + +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov() +; IR: %call.i = call {{.*}} @_Z3barv() +; IR: define dso_local {{.*}} @main(i32 noundef %argc, ptr noundef %argv) +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. 
It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call.i = call {{.*}} @_Z3barv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 2 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 2 pgho-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc2\n_Z3bazv -\> _Znam",tooltip="N[[BAZ]] ContextIds: 4 3",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[FOO2:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\nnull call (external)",tooltip="N[[FOO2]] ContextIds: 4 3",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold