diff --git a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
--- a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
+++ b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
@@ -99,6 +99,10 @@
 ImmutablePass *
 createImmutableModuleSummaryIndexWrapperPass(const ModuleSummaryIndex *Index);
 
+/// Returns true if the instruction could have memprof metadata, used to ensure
+/// consistency between summary analysis and the ThinLTO backend processing.
+bool mayHaveMemprofSummary(const CallBase *CB);
+
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_MODULESUMMARYANALYSIS_H
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -18,13 +18,13 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/PassManager.h"
 #include <functional>
 
 namespace llvm {
 class GlobalValueSummary;
 class Module;
-class ModuleSummaryIndex;
 class OptimizationRemarkEmitter;
 
 class MemProfContextDisambiguation
@@ -34,8 +34,19 @@
       Module &M,
       function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
+  /// In the ThinLTO backend, apply the cloning decisions in ImportSummary to
+  /// the IR.
+  bool applyImport(Module &M);
+
+  /// Import summary containing cloning decisions for the ThinLTO backend.
+  const ModuleSummaryIndex *ImportSummary;
+
+  // Owns the import summary specified by internal options for testing the
+  // ThinLTO backend via opt (to simulate distributed ThinLTO).
+  std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;
+
 public:
-  MemProfContextDisambiguation() {}
+  MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -284,6 +284,10 @@
   std::vector<CallsiteInfo> Callsites;
   std::vector<AllocInfo> Allocs;
 
+#ifndef NDEBUG
+  DenseSet<const CallBase *> CallsThatMayHaveMemprofSummary;
+#endif
+
   bool HasInlineAsmMaybeReferencingInternal = false;
   bool HasIndirBranchToBlockAddress = false;
   bool HasUnknownCall = false;
@@ -427,6 +431,10 @@
           .updateHotness(getHotness(Candidate.Count, PSI));
     }
 
+    // Summarize memprof related metadata. This is only needed for ThinLTO.
+    if (!IsThinLTO)
+      continue;
+
    // TODO: Skip indirect calls for now. Need to handle these better, likely
    // by creating multiple Callsites, one per target, then speculatively
    // devirtualize while applying clone info in the ThinLTO backends. This
@@ -437,6 +445,14 @@
     if (!CalledFunction)
       continue;
 
+    // Ensure we keep this analysis in sync with the handling in the ThinLTO
+    // backend (see MemProfContextDisambiguation::applyImport). Save this call
+    // so that we can skip it when checking the reverse case later.
+    assert(mayHaveMemprofSummary(CB));
+#ifndef NDEBUG
+    CallsThatMayHaveMemprofSummary.insert(CB);
+#endif
+
     // Compute the list of stack ids first (so we can trim them from the stack
     // ids on any MIBs).
     CallStack<MDNode, MDNode::op_iterator> InstCallsite(
@@ -546,6 +562,25 @@
                        ? CalleeInfo::HotnessType::Cold
                        : CalleeInfo::HotnessType::Critical);
 
+#ifndef NDEBUG
+  // Make sure that all calls we decided could not have memprof summaries get a
+  // false value for mayHaveMemprofSummary, to ensure that this handling remains
+  // in sync with the ThinLTO backend handling.
+  if (IsThinLTO) {
+    for (const BasicBlock &BB : F) {
+      for (const Instruction &I : BB) {
+        const auto *CB = dyn_cast<CallBase>(&I);
+        if (!CB)
+          continue;
+        // We already checked these above.
+        if (CallsThatMayHaveMemprofSummary.count(CB))
+          continue;
+        assert(!mayHaveMemprofSummary(CB));
+      }
+    }
+  }
+#endif
+
   bool NonRenamableLocal = isNonRenamableLocal(F);
   bool NotEligibleForImport =
       NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
@@ -1042,3 +1077,36 @@
 INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info",
                 "Module summary info", false, true)
+
+bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
+  if (!CB)
+    return false;
+  if (CB->isDebugOrPseudoInst())
+    return false;
+  auto *CI = dyn_cast<CallInst>(CB);
+  auto *CalledValue = CB->getCalledOperand();
+  auto *CalledFunction = CB->getCalledFunction();
+  if (CalledValue && !CalledFunction) {
+    CalledValue = CalledValue->stripPointerCasts();
+    // Stripping pointer casts can reveal a called function.
+    CalledFunction = dyn_cast<Function>(CalledValue);
+  }
+  // Check if this is an alias to a function. If so, get the
+  // called aliasee for the checks below.
+  if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
+    assert(!CalledFunction &&
+           "Expected null called function in callsite for alias");
+    CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
+  }
+  // Check if this is a direct call to a known function or a known
+  // intrinsic, or an indirect call with profile data.
+  if (CalledFunction) {
+    if (CI && CalledFunction->isIntrinsic())
+      return false;
+  } else {
+    // TODO: For now skip indirect calls. See comments in
+    // computeFunctionSummary for what is needed to handle this.
+    return false;
+  }
+  return true;
+}
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1531,6 +1531,11 @@
   ModulePassManager MPM;
 
   if (ImportSummary) {
+    // For ThinLTO we must apply the context disambiguation decisions early, to
+    // ensure we can correctly match the callsites to summary data.
+    if (EnableMemProfContextDisambiguation)
+      MPM.addPass(MemProfContextDisambiguation(ImportSummary));
+
     // These passes import type identifier resolutions for whole-program
     // devirtualization and CFI.
     // They must run early because other passes may disturb the specific
     // instruction patterns that these passes look for,
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -52,10 +53,30 @@
 STATISTIC(FunctionClonesAnalysis,
           "Number of function clones created during whole program analysis");
+STATISTIC(FunctionClonesThinBackend,
+          "Number of function clones created during ThinLTO backend");
+STATISTIC(FunctionsClonedThinBackend,
+          "Number of functions that had clones created during ThinLTO backend");
 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                             "cloned) during whole program analysis");
 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                          "during whole program analysis");
+STATISTIC(AllocTypeNotColdThinBackend,
+          "Number of not cold static allocations (possibly cloned) during "
+          "ThinLTO backend");
+STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
+                                    "(possibly cloned) during ThinLTO backend");
+STATISTIC(OrigAllocsThinBackend,
+          "Number of original (not cloned) allocations with memprof profiles "
+          "during ThinLTO backend");
+STATISTIC(
+    AllocVersionsThinBackend,
+    "Number of allocation versions (including clones) during ThinLTO backend");
+STATISTIC(MaxAllocVersionsThinBackend,
+          "Maximum number of allocation versions created for an original "
+          "allocation during ThinLTO backend");
+STATISTIC(UnclonableAllocsThinBackend,
+          "Number of unclonable ambiguous allocations during ThinLTO backend");
 
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -78,6 +99,11 @@
     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                 cl::desc("Perform frequent verification checks on nodes."));
 
+static cl::opt<std::string> MemProfImportSummary(
+    "memprof-import-summary",
+    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
+    cl::Hidden);
+
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
 ///
 /// The graph represents the call contexts in all memprof metadata on allocation
@@ -109,8 +135,8 @@
   /// Assign callsite clones to functions, cloning functions as needed to
   /// accommodate the combinations of their callsite clones reached by callers.
   /// For regular LTO this clones functions and callsites in the IR, but for
-  /// ThinLTO the cloning decisions are noted in the summaries and applied
-  /// later.
+  /// ThinLTO the cloning decisions are noted in the summaries and later applied
+  /// in applyImport.
   bool assignFunctions();
 
   void dump() const;
@@ -2779,6 +2805,358 @@
   return Changed;
 }
 
+static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
+    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
+    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
+        &FuncToAliasMap) {
+  // The first "clone" is the original copy; we should only call this if we
+  // needed to create new clones.
+  assert(NumClones > 1);
+  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+  VMaps.reserve(NumClones - 1);
+  FunctionsClonedThinBackend++;
+  for (unsigned I = 1; I < NumClones; I++) {
+    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
+    auto *NewF = CloneFunction(&F, *VMaps.back());
+    FunctionClonesThinBackend++;
+    // Strip memprof and callsite metadata from clone as they are no longer
+    // needed.
+    for (auto &BB : *NewF) {
+      for (auto &Inst : BB) {
+        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
+        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
+      }
+    }
+    std::string Name = getMemProfFuncName(F.getName(), I);
+    auto *PrevF = M.getFunction(Name);
+    if (PrevF) {
+      // We might have created this when adjusting a callsite in another
+      // function. It should be a declaration.
+      assert(PrevF->isDeclaration());
+      NewF->takeName(PrevF);
+      PrevF->replaceAllUsesWith(NewF);
+      PrevF->eraseFromParent();
+    } else
+      NewF->setName(Name);
+    ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
+             << "created clone " << ore::NV("NewFunction", NewF));
+
+    // Now handle aliases to this function, and clone those as well.
+    if (!FuncToAliasMap.count(&F))
+      continue;
+    for (auto *A : FuncToAliasMap[&F]) {
+      std::string Name = getMemProfFuncName(A->getName(), I);
+      auto *PrevA = M.getNamedAlias(Name);
+      auto *NewA = GlobalAlias::create(A->getValueType(),
+                                       A->getType()->getPointerAddressSpace(),
+                                       A->getLinkage(), Name, NewF);
+      NewA->copyAttributesFrom(A);
+      if (PrevA) {
+        // We might have created this when adjusting a callsite in another
+        // function. It should be a declaration.
+        assert(PrevA->isDeclaration());
+        NewA->takeName(PrevA);
+        PrevA->replaceAllUsesWith(NewA);
+        PrevA->eraseFromParent();
+      }
+    }
+  }
+  return VMaps;
+}
+
+// Locate the summary for F. This is complicated by the fact that it might
+// have been internalized or promoted.
+static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
+                                      const ModuleSummaryIndex *ImportSummary) {
+  // FIXME: Ideally we would retain the original GUID in some fashion on the
+  // function (e.g. as metadata), but for now do our best to locate the
+  // summary without that information.
+  ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
+  if (!TheFnVI)
+    // See if the function was internalized, by checking the index directly
+    // with the original name (this avoids the name adjustment done by
+    // getGUID() for internal symbols).
+    TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
+  if (TheFnVI)
+    return TheFnVI;
+  // Now query with the original name before any promotion was performed.
+  StringRef OrigName =
+      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
+  std::string OrigId = GlobalValue::getGlobalIdentifier(
+      OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
+  TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
+  if (TheFnVI)
+    return TheFnVI;
+  // Could be a promoted local imported from another module. We need to pass
+  // down more info here to find the original module id. For now, try with
+  // the OrigName which might have been stored in the OidGuidMap in the
+  // index. This would not work if there were same-named locals in multiple
+  // modules, however.
+  auto OrigGUID =
+      ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
+  if (OrigGUID)
+    TheFnVI = ImportSummary->getValueInfo(OrigGUID);
+  return TheFnVI;
+}
+
+bool MemProfContextDisambiguation::applyImport(Module &M) {
+  assert(ImportSummary);
+  bool Changed = false;
+
+  auto IsMemProfClone = [](const Function &F) {
+    return F.getName().contains(MemProfCloneSuffix);
+  };
+
+  // We also need to clone any aliases that reference cloned functions, because
+  // the modified callsites may invoke via the alias. Keep track of the aliases
+  // for each function.
+  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
+      FuncToAliasMap;
+  for (auto &A : M.aliases()) {
+    auto *Aliasee = A.getAliaseeObject();
+    if (auto *F = dyn_cast<Function>(Aliasee))
+      FuncToAliasMap[F].insert(&A);
+  }
+
+  for (auto &F : M) {
+    if (F.isDeclaration() || IsMemProfClone(F))
+      continue;
+
+    OptimizationRemarkEmitter ORE(&F);
+
+    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+    bool ClonesCreated = false;
+    unsigned NumClonesCreated = 0;
+    auto CloneFuncIfNeeded = [&](unsigned NumClones) {
+      // We should at least have version 0 which is the original copy.
+      assert(NumClones > 0);
+      // If only one copy is needed, use the original.
+      if (NumClones == 1)
+        return;
+      // If we already performed cloning of this function, confirm that the
+      // requested number of clones matches (the thin link should ensure the
+      // number of clones for each constituent callsite is consistent within
+      // each function), before returning.
+      if (ClonesCreated) {
+        assert(NumClonesCreated == NumClones);
+        return;
+      }
+      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
+      // The first "clone" is the original copy, which doesn't have a VMap.
+      assert(VMaps.size() == NumClones - 1);
+      Changed = true;
+      ClonesCreated = true;
+      NumClonesCreated = NumClones;
+    };
+
+    // Locate the summary for F.
+    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
+    // If not found, this could be an imported local (see comment in
+    // findValueInfoForFunc). Skip for now as it will be cloned in its original
+    // module (where it would have been promoted to global scope so should
+    // satisfy any reference in this module).
+    if (!TheFnVI)
+      continue;
+
+    auto *GVSummary =
+        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
+    if (!GVSummary)
+      // Must have been imported, use the first summary (might be multiple if
+      // this was a linkonce_odr).
+      GVSummary = TheFnVI.getSummaryList().front().get();
+
+    // If this was an imported alias skip it as we won't have the function
+    // summary, and it should be cloned in the original module.
+    if (isa<AliasSummary>(GVSummary))
+      continue;
+
+    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
+
+    if (FS->allocs().empty() && FS->callsites().empty())
+      continue;
+
+    auto SI = FS->callsites().begin();
+    auto AI = FS->allocs().begin();
+
+    // Assume for now that the instructions are in the exact same order
+    // as when the summary was created, but confirm this is correct by
+    // matching the stack ids.
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        auto *CB = dyn_cast<CallBase>(&I);
+        // Same handling as when creating module summary.
+        if (!mayHaveMemprofSummary(CB))
+          continue;
+
+        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
+            I.getMetadata(LLVMContext::MD_callsite));
+        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
+
+        // Include allocs that were already assigned a memprof function
+        // attribute in the statistics.
+        if (CB->getAttributes().hasFnAttr("memprof")) {
+          assert(!MemProfMD);
+          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+              ? AllocTypeColdThinBackend++
+              : AllocTypeNotColdThinBackend++;
+          OrigAllocsThinBackend++;
+          AllocVersionsThinBackend++;
+          if (!MaxAllocVersionsThinBackend)
+            MaxAllocVersionsThinBackend = 1;
+          // Remove any remaining callsite metadata, and we can skip the rest
+          // of the handling for this instruction, since no cloning is needed.
+          I.setMetadata(LLVMContext::MD_callsite, nullptr);
+          continue;
+        }
+
+        if (MemProfMD) {
+          // Consult the next alloc node.
+          assert(AI != FS->allocs().end());
+          auto &AllocNode = *(AI++);
+
+          // Sanity check that the MIB stack ids match between the summary and
+          // instruction metadata.
+          auto MIBIter = AllocNode.MIBs.begin();
+          for (auto &MDOp : MemProfMD->operands()) {
+            assert(MIBIter != AllocNode.MIBs.end());
+            auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
+            auto *MIBMD = cast<const MDNode>(MDOp);
+            MDNode *StackMDNode = getMIBStackNode(MIBMD);
+            assert(StackMDNode);
+            SmallVector<uint64_t> StackIdsFromMetadata;
+            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
+            for (auto ContextIter =
+                     StackContext.beginAfterSharedPrefix(CallsiteContext);
+                 ContextIter != StackContext.end(); ++ContextIter) {
+              // If this is a direct recursion, simply skip the duplicate
+              // entries, to be consistent with how the summary ids were
+              // generated during ModuleSummaryAnalysis.
+              if (!StackIdsFromMetadata.empty() &&
+                  StackIdsFromMetadata.back() == *ContextIter)
+                continue;
+              assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
+              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
+                     *ContextIter);
+              StackIdIndexIter++;
+            }
+            MIBIter++;
+          }
+
+          // Perform cloning if not yet done.
+          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
+
+          OrigAllocsThinBackend++;
+          AllocVersionsThinBackend += AllocNode.Versions.size();
+          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
+            MaxAllocVersionsThinBackend = AllocNode.Versions.size();
+
+          // If there is only one version, that means we didn't end up
+          // considering this function for cloning, and in that case the alloc
+          // will still be None type or should have gotten the default NotCold.
+          // Skip that after calling the clone helper, since it does some
+          // sanity checks that confirm we haven't yet decided that we need
+          // cloning.
+          if (AllocNode.Versions.size() == 1) {
+            assert((AllocationType)AllocNode.Versions[0] ==
+                       AllocationType::NotCold ||
+                   (AllocationType)AllocNode.Versions[0] ==
+                       AllocationType::None);
+            UnclonableAllocsThinBackend++;
+            continue;
+          }
+
+          // All versions should have a singular allocation type.
+          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
+            return Type == ((uint8_t)AllocationType::NotCold |
+                            (uint8_t)AllocationType::Cold);
+          }));
+
+          // Update the allocation types per the summary info.
+          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
+            // Ignore any that didn't get an assigned allocation type.
+            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
+              continue;
+            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
+            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
+                                            : AllocTypeNotColdThinBackend++;
+            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
+            auto A = llvm::Attribute::get(F.getContext(), "memprof",
+                                          AllocTypeString);
+            CallBase *CBClone;
+            // Copy 0 is the original function.
+            if (!J)
+              CBClone = CB;
+            else
+              // Since VMaps are only created for new clones, we index with
+              // clone J-1 (J==0 is the original clone and does not have a
+              // VMaps entry).
+              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+            CBClone->addFnAttr(A);
+            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
+                     << ore::NV("AllocationCall", CBClone) << " in clone "
+                     << ore::NV("Caller", CBClone->getFunction())
+                     << " marked with memprof allocation attribute "
+                     << ore::NV("Attribute", AllocTypeString));
+          }
+        } else if (!CallsiteContext.empty()) {
+          // Consult the next callsite node.
+          assert(SI != FS->callsites().end());
+          auto &StackNode = *(SI++);
+
+#ifndef NDEBUG
+          // Sanity check that the stack ids match between the summary and
+          // instruction metadata.
+          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
+          for (auto StackId : CallsiteContext) {
+            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
+            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
+                   StackId);
+            StackIdIndexIter++;
+          }
+#endif
+
+          // Perform cloning if not yet done.
+          CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
+
+          // Should have skipped indirect calls via mayHaveMemprofSummary.
+          assert(CB->getCalledFunction());
+          assert(!IsMemProfClone(*CB->getCalledFunction()));
+
+          // Update the calls per the summary info.
+          // Save the original name since it gets updated in the first
+          // iteration below.
+          auto CalleeOrigName = CB->getCalledFunction()->getName();
+          for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
+            // Do nothing if this version calls the original version of its
+            // callee.
+            if (!StackNode.Clones[J])
+              continue;
+            auto NewF = M.getOrInsertFunction(
+                getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
+                CB->getCalledFunction()->getFunctionType());
+            CallBase *CBClone;
+            // Copy 0 is the original function.
+            if (!J)
+              CBClone = CB;
+            else
+              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+            CBClone->setCalledFunction(NewF);
+            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
+                     << ore::NV("Call", CBClone) << " in clone "
+                     << ore::NV("Caller", CBClone->getFunction())
+                     << " assigned to call function clone "
+                     << ore::NV("Callee", NewF.getCallee()));
+          }
+        }
+        // Memprof and callsite metadata on memory allocations are no longer
+        // needed.
+        I.setMetadata(LLVMContext::MD_memprof, nullptr);
+        I.setMetadata(LLVMContext::MD_callsite, nullptr);
+      }
+    }
+  }
+
+  return Changed;
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
   if (DumpCCG) {
@@ -2820,12 +3198,46 @@
 bool MemProfContextDisambiguation::processModule(
     Module &M,
     function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
-  bool Changed = false;
+
+  // If we have an import summary, then the cloning decisions were made during
+  // the thin link on the index. Apply them and return.
+  if (ImportSummary)
+    return applyImport(M);
 
   ModuleCallsiteContextGraph CCG(M, OREGetter);
-  Changed = CCG.process();
+  return CCG.process();
+}
 
-  return Changed;
+MemProfContextDisambiguation::MemProfContextDisambiguation(
+    const ModuleSummaryIndex *Summary)
+    : ImportSummary(Summary) {
+  if (ImportSummary) {
+    // The MemProfImportSummary should only be used for testing ThinLTO
+    // distributed backend handling via opt, in which case we don't have a
+    // summary from the pass pipeline.
+    assert(MemProfImportSummary.empty());
+    return;
+  }
+  if (MemProfImportSummary.empty())
+    return;
+
+  auto ReadSummaryFile =
+      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
+  if (!ReadSummaryFile) {
+    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
+                          "Error loading file '" + MemProfImportSummary +
+                              "': ");
+    return;
+  }
+  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
+  if (!ImportSummaryForTestingOrErr) {
+    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
+                          "Error parsing file '" + MemProfImportSummary +
+                              "': ");
+    return;
+  }
+  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
+  ImportSummary = ImportSummaryForTesting.get();
 }
 
 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -44,12 +44,14 @@
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -71,11 +73,18 @@
 ;; Check distributed index
 ; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "memprof-basic.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !0
   %call1 = call ptr @_Z3foov(), !callsite !1
@@ -86,7 +95,7 @@
 
 declare i32 @sleep()
 
-define internal ptr @_Z3barv() {
+define internal ptr @_Z3barv() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7
   ret ptr null
@@ -94,13 +103,13 @@
 
 declare ptr @_Znam(i64)
 
-define internal ptr @_Z3bazv() {
+define internal ptr @_Z3bazv() #0 {
 entry:
   %call = call ptr @_Z3barv(), !callsite !8
   ret ptr null
 }
 
-define internal ptr @_Z3foov() {
+define internal ptr @_Z3foov() #0 {
 entry:
   %call = call ptr @_Z3bazv(), !callsite !9
   ret ptr null
@@ -109,6 +118,8 @@
 ; uselistorder directives
 uselistorder ptr @_Z3foov, { 1, 0 }
 
+attributes #0 = { noinline optnone }
+
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
@@ -252,9 +263,50 @@
 
 ; DUMP: Clone of [[BAR]]
 
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _Z3bazv.memprof.1
+; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
+
+
+; IR: define {{.*}} @main
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR: call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv()
+; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv()
+; IR: call {{.*}} @_Z3barv()
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Z3bazv()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv.memprof.1()
+; IR: call {{.*}} @_Z3barv.memprof.1()
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3bazv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 ; DOT: digraph "postbuild" {
diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
--- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -66,13 +66,15 @@
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
 ;; We should clone D once for the cold allocations via C.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -95,11 +97,18 @@
 ;; Check distributed index
 ; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "duplicate-context-ids.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define internal ptr @_Z1Dv() {
+define internal ptr @_Z1Dv() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
   ret ptr null
@@ -107,31 +116,31 @@
 
 declare ptr @_Znam(i64)
 
-define internal ptr @_Z1Fv() {
+define internal ptr @_Z1Fv() #0 {
 entry:
   %call = call ptr @_Z1Dv(), !callsite !6
   ret ptr null
 }
 
-define internal ptr @_Z1Cv() {
+define internal ptr @_Z1Cv() #0 {
 entry:
   %call = call ptr @_Z1Dv(), !callsite !7
   ret ptr null
 }
 
-define internal ptr @_Z1Bv() {
+define internal ptr @_Z1Bv() #0 {
 entry:
   %call.i = call ptr @_Z1Dv(), !callsite !8
   ret ptr null
 }
 
-define internal ptr @_Z1Ev() {
+define internal ptr @_Z1Ev() #0 {
 entry:
   %call.i = call ptr @_Z1Dv(), !callsite !9
   ret ptr null
 }
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   call ptr @_Z1Bv()
   call ptr @_Z1Ev()
@@ -143,6 +152,8 @@
 
 declare i32 @sleep()
 
+attributes #0 = { noinline optnone }
+
 !0 = !{!1, !3}
 !1 = !{!2, !"cold"}
 !2 = !{i64 6541423618768552252, i64 -6270142974039008131}
@@ -300,10 +311,43 @@
 ; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
 ; DUMP: Clone of [[D]]
 
+; REMARKS: created clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
+
+
+;; The allocation via F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR: call {{.*}} @_Z1Dv()
+;; The allocations via B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with a
+;; "cold" attribute.
+; IR: define internal {{.*}} @_Z1Bv()
+; IR: call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Ev()
+; IR: call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Dv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 ; DOTPRE: digraph "prestackupdate" {
diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
--- a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
@@ -58,7 +58,9 @@
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
+
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
 
 ;; Try again but with distributed ThinLTO
@@ -73,13 +75,20 @@
 ; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
 ; RUN:  --check-prefix=STATS
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "funcassigncloning.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: noinline optnone
-define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
 entry:
   %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
   %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
@@ -107,7 +116,7 @@
 }
 
 ; Function Attrs: noinline optnone
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   call void @_Z1BPPcS0_()
   call void @_Z1CPPcS0_()
@@ -122,6 +131,8 @@
 ; uselistorder directives
 uselistorder ptr @_Znam, { 1, 0 }
 
+attributes #0 = { noinline optnone }
+
 !0 = !{!1, !3, !5}
 !1 = !{!2, !"cold"}
 !2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
@@ -230,6 +241,54 @@
 ; DUMP: Clone of [[ENEW2ORIG]]
+
+;; We greedily create a clone of E that is initially used by the clones of the
+;; first call to new. However, we end up with an incompatible set of callers
+;; given the second call to new, which has clones with a different combination
+;; of callers. Eventually, we create 2 more clones, and the first clone becomes
+;; dead.
+; REMARKS: created clone _Z1EPPcS0_.memprof.1
+; REMARKS: created clone _Z1EPPcS0_.memprof.2
+; REMARKS: created clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
+
+
+;; Original version of E is used for the non-cold allocations, both from B.
+; IR: define internal {{.*}} @_Z1EPPcS0_(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1BPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_(
+;; C calls a clone of E with the first new allocating cold memory and the
+;; second allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1CPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.memprof.3(
+;; D calls a clone of E with the first new allocating non-cold memory and the
+;; second allocating cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 8 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 4 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
--- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -66,13 +66,15 @@
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should only create a single clone of foo, for the direct call
 ;; from main allocating cold memory.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -94,6 +96,13 @@
 ;; from main allocating cold memory.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "indirectcall.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -101,12 +110,12 @@
 @_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr
 @_ZTVN10__cxxabiv117__class_type_infoE = external global ptr
 
-define internal ptr @_Z3barP1A(ptr %a) {
+define internal ptr @_Z3barP1A(ptr %a) #0 {
 entry:
   ret ptr null
 }
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !0
   %call1 = call ptr @_Z3foov(), !callsite !1
@@ -121,19 +130,19 @@
 
 declare i32 @sleep()
 
-define internal ptr @_ZN1A1xEv() {
+define internal ptr @_ZN1A1xEv() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !6
   ret ptr null
 }
 
-define internal ptr @_ZN1B1xEv() {
+define internal ptr @_ZN1B1xEv() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !7
   ret ptr null
 }
 
-define internal ptr @_Z3foov() {
+define internal ptr @_Z3foov() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21
   ret ptr null
@@ -144,6 +153,8 @@
 ; uselistorder directives
 uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
 
+attributes #0 = { noinline optnone }
+
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{i64 6792096022461663180}
@@ -384,9 +395,39 @@
 ; DUMP: Clone of [[FOO]]
 
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
+
+
+; IR: define {{.*}} @main(
+; IR: call {{.*}} @_Z3foov()
+;; Only the second call to foo, which allocates cold memory via direct calls,
+;; is replaced with a call to a clone that calls a cold allocation.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 ; DOT: digraph "postbuild" {
diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
--- a/llvm/test/ThinLTO/X86/memprof-inlined.ll
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -54,13 +54,16 @@
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE \
+; RUN:  --check-prefix=STATS-INPROCESS-BE --check-prefix=REMARKS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should create clones for foo and bar for the call from main to allocate
 ;; cold memory.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -80,11 +83,19 @@
 ;; cold memory.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=STATS-DISTRIB-BE \
+; RUN:  --check-prefix=REMARKS
+
 source_filename = "inlined.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define internal ptr @_Z3barv() {
+define internal ptr @_Z3barv() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
   ret ptr null
@@ -92,19 +103,19 @@
 
 declare ptr @_Znam(i64)
 
-define internal ptr @_Z3bazv() {
+define internal ptr @_Z3bazv() #0 {
 entry:
   %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6
   ret ptr null
 }
 
-define internal ptr @_Z3foov() {
+define internal ptr @_Z3foov() #0 {
 entry:
   %call.i = call ptr @_Z3barv(), !callsite !7
   ret ptr null
 }
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !8
   %call1 = call ptr @_Z3foov(), !callsite !9
@@ -115,6 +126,8 @@
 
 declare i32 @sleep()
 
+attributes #0 = { noinline optnone }
+
 !0 = !{!1, !3}
 !1 = !{!2, !"notcold"}
 !2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
@@ -281,9 +294,50 @@
 ; DUMP: Clone of [[BAR]]
 
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+
+
+; IR: define internal {{.*}} @_Z3barv()
+; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Z3barv()
+; IR: define {{.*}} @main()
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR: call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3barv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-INPROCESS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
+;; The distributed backend hasn't yet eliminated the now-dead baz with
+;; the allocation from bar inlined, so it has one more allocation.
+; STATS-DISTRIB-BE: 3 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 2 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-INPROCESS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
+;; The distributed backend hasn't yet eliminated the now-dead baz with
+;; the allocation from bar inlined, so it has one more allocation.
+; STATS-DISTRIB-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 ; DOT: digraph "postbuild" {