diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -17,18 +17,12 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include namespace llvm { namespace memprof { -// Allocation type assigned to an allocation reached by a given context. -// More can be added but initially this is just noncold and cold. -// Values should be powers of two so that they can be ORed, in particular to -// track allocations that have different behavior with different calling -// contexts. -enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 }; - /// Return the allocation type for a given set of memory profile values. AllocationType getAllocType(uint64_t MaxAccessCount, uint64_t MinSize, uint64_t MinLifetime); @@ -106,6 +100,62 @@ bool buildAndAttachMIBMetadata(CallBase *CI); }; +/// Helper class to iterate through stack ids in both metadata (memprof MIB and +/// callsite) and the corresponding ThinLTO summary data structures +/// (CallsiteInfo and MIBInfo). This simplifies implementation of client code +/// which doesn't need to worry about whether we are operating with IR (Regular +/// LTO), or summary (ThinLTO). +template class CallStack { +public: + CallStack(const NodeT *N = nullptr) : N(N) {} + + // Implement minimum required methods for range-based for loop. + // The default implementation assumes we are operating on ThinLTO data + // structures, which have a vector of StackIdIndices. There are specialized + // versions provided to iterate through metadata. + struct CallStackIterator { + const NodeT *N = nullptr; + IteratorT Iter; + CallStackIterator(const NodeT *N, bool End) : N(N) { + if (!N) + return; + Iter = End ? 
N->StackIdIndices.end() : N->StackIdIndices.begin(); + } + uint64_t operator*() { + assert(Iter != N->StackIdIndices.end()); + return *Iter; + } + bool operator==(const CallStackIterator &rhs) { return Iter == rhs.Iter; } + bool operator!=(const CallStackIterator &rhs) { return !(*this == rhs); } + void operator++() { ++Iter; } + }; + + bool empty() const { return N == nullptr; } + + CallStackIterator begin() const { + return CallStackIterator(N, /*End*/ false); + } + CallStackIterator end() const { return CallStackIterator(N, /*End*/ true); } + + CallStackIterator beginAfterSharedPrefix(CallStack &Other) { + CallStackIterator Cur = begin(); + for (CallStackIterator OtherCur = Other.begin(); + Cur != end() && OtherCur != Other.end(); ++Cur, ++OtherCur) + assert(*Cur == *OtherCur); + return Cur; + } + +private: + const NodeT *N = nullptr; +}; + +/// Specializations for iterating through IR metadata stack contexts. +template <> +CallStack::CallStackIterator::CallStackIterator( + const MDNode *N, bool End); +template <> +uint64_t CallStack::CallStackIterator::operator*(); + } // end namespace memprof } // end namespace llvm diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -398,6 +398,10 @@ void addGlobalValueToIndex(std::string Name, GlobalValue::GUID, GlobalValue::LinkageTypes Linkage, unsigned ID, std::unique_ptr Summary); + bool parseOptionalAllocs(std::vector &Allocs); + bool parseMemProfs(std::vector &MIBs); + bool parseAllocType(uint8_t &AllocType); + bool parseOptionalCallsites(std::vector &Callsites); // Type Parsing. 
bool parseType(Type *&Result, const Twine &Msg, bool AllowVoid = false); diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -394,6 +394,15 @@ kw_byte, kw_bit, kw_varFlags, + // The following are used by MemProf summary info. + kw_callsites, + kw_clones, + kw_stackIds, + kw_allocs, + kw_versions, + kw_memProf, + kw_notcold, + kw_notcoldandcold, // GV's with __attribute__((no_sanitize("address"))), or things in // -fsanitize-ignorelist when built with ASan. diff --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h --- a/llvm/include/llvm/AsmParser/Parser.h +++ b/llvm/include/llvm/AsmParser/Parser.h @@ -105,6 +105,17 @@ std::unique_ptr parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err); +/// The function is a secondary interface to the LLVM Assembly Parser. It parses +/// an ASCII string that (presumably) contains LLVM Assembly code for a module +/// summary. It returns a ModuleSummaryIndex with the corresponding features. +/// Note that this does not verify that the generated Index is valid, so you +/// should run the verifier after parsing the file to check that it is okay. +/// Parse LLVM Assembly from a string +/// \param AsmString The string containing assembly +/// \param Err Error result info. +std::unique_ptr +parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err); + /// parseAssemblyFile and parseAssemblyString are wrappers around this function. /// Parse LLVM Assembly from a MemoryBuffer. 
/// \param F The MemoryBuffer containing assembly diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h --- a/llvm/include/llvm/Bitcode/BitcodeReader.h +++ b/llvm/include/llvm/Bitcode/BitcodeReader.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Bitstream/BitCodeEnums.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" @@ -117,8 +118,11 @@ /// Parse the specified bitcode buffer and merge its module summary index /// into CombinedIndex. - Error readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath, - uint64_t ModuleId); + Error readSummary( + ModuleSummaryIndex &CombinedIndex, StringRef ModulePath, + uint64_t ModuleId, + std::function IsPrevailing = + [](GlobalValue::GUID) { return true; }); }; struct BitcodeFileContents { diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -301,6 +301,22 @@ // Range information for accessed offsets for every argument. // [n x (paramno, range, numcalls, numcalls x (callee_guid, paramno, range))] FS_PARAM_ACCESS = 25, + // Summary of per-module memprof callsite metadata. + // [valueid, n x stackidindex] + FS_PERMODULE_CALLSITE_INFO = 26, + // Summary of per-module allocation memprof metadata. + // [n x (alloc type, nummib, nummib x stackidindex)] + FS_PERMODULE_ALLOC_INFO = 27, + // Summary of combined index memprof callsite metadata. + // [valueid, numstackindices, numver, + // numstackindices x stackidindex, numver x version] + FS_COMBINED_CALLSITE_INFO = 28, + // Summary of combined index allocation memprof metadata. 
+ // [nummib, numver, + // nummib x (alloc type, numstackids, numstackids x stackidindex), + // numver x version] + FS_COMBINED_ALLOC_INFO = 29, + FS_STACK_IDS = 30, }; enum MetadataCodes { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -19,6 +19,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -284,6 +285,79 @@ static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.getRef(); } }; +/// Summary of memprof callsite metadata. +struct CallsiteInfo { + // Actual callee function. + ValueInfo Callee; + + // Used to record whole program analysis cloning decisions. + // The ThinLTO backend will need to create as many clones as there are entries + // in the vector (it is expected and should be confirmed that all such + // summaries in the same FunctionSummary have the same number of entries). + // Each index records version info for the corresponding clone of this + // function. The value is the callee clone it calls (becomes the appended + // suffix id). Index 0 is the original version, and a value of 0 calls the + // original callee. + SmallVector Clones{0}; + + // Represents stack ids in this context, recorded as indices into the + // StackIds vector in the summary index, which in turn holds the full 64-bit + // stack ids. This reduces memory as there are in practice far fewer unique + // stack ids than stack id references. 
+ SmallVector StackIdIndices; + + CallsiteInfo(ValueInfo Callee, SmallVector StackIdIndices) + : Callee(Callee), StackIdIndices(std::move(StackIdIndices)) {} + CallsiteInfo(ValueInfo Callee, SmallVector Clones, + SmallVector StackIdIndices) + : Callee(Callee), Clones(std::move(Clones)), + StackIdIndices(std::move(StackIdIndices)) {} +}; + +// Allocation type assigned to an allocation reached by a given context. +// More can be added but initially this is just noncold and cold. +// Values should be powers of two so that they can be ORed, in particular to +// track allocations that have different behavior with different calling +// contexts. +enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 }; + +/// Summary of a single MIB in a memprof metadata on allocations. +struct MIBInfo { + // The allocation type for this profiled context. + AllocationType AllocType; + + // Represents stack ids in this context, recorded as indices into the + // StackIds vector in the summary index, which in turn holds the full 64-bit + // stack ids. This reduces memory as there are in practice far fewer unique + // stack ids than stack id references. + SmallVector StackIdIndices; + + MIBInfo(AllocationType AllocType, SmallVector StackIdIndices) + : AllocType(AllocType), StackIdIndices(std::move(StackIdIndices)) {} +}; + +/// Summary of memprof metadata on allocations. +struct AllocInfo { + // Used to record whole program analysis cloning decisions. + // The ThinLTO backend will need to create as many clones as there are entries + // in the vector (it is expected and should be confirmed that all such + // summaries in the same FunctionSummary have the same number of entries). + // Each index records version info for the corresponding clone of this + // function. The value is the allocation type of the corresponding allocation. + // Index 0 is the original version. Before cloning, index 0 may have more than + // one allocation type. 
+ SmallVector Versions; + + // Vector of MIBs in this memprof metadata. + std::vector MIBs; + + AllocInfo(std::vector MIBs) : MIBs(std::move(MIBs)) { + Versions.push_back(0); + } + AllocInfo(SmallVector Versions, std::vector MIBs) + : Versions(std::move(Versions)), MIBs(std::move(MIBs)) {} +}; + /// Function and variable summary information to aid decisions and /// implementation of importing. class GlobalValueSummary { @@ -678,7 +752,8 @@ std::vector(), std::vector(), std::vector(), - std::vector()); + std::vector(), + std::vector(), std::vector()); } /// A dummy node to reference external functions that aren't in the index @@ -706,6 +781,25 @@ using ParamAccessesTy = std::vector; std::unique_ptr ParamAccesses; + /// Optional list of memprof callsite metadata summaries. The correspondence + /// between the callsite summary and the callsites in the function is implied + /// by the order in the vector (and can be validated by comparing the stack + /// ids in the CallsiteInfo to those in the instruction callsite metadata). + /// As a memory savings optimization, we only create these for the prevailing + /// copy of a symbol when creating the combined index during LTO. + using CallsitesTy = std::vector; + std::unique_ptr Callsites; + + /// Optional list of allocation memprof metadata summaries. The correspondence + /// between the alloc memprof summary and the allocation callsites in the + /// function is implied by the order in the vector (and can be validated by + /// comparing the stack ids in the AllocInfo to those in the instruction + /// memprof metadata). + /// As a memory savings optimization, we only create these for the prevailing + /// copy of a symbol when creating the combined index during LTO. 
+ using AllocsTy = std::vector; + std::unique_ptr Allocs; + public: FunctionSummary(GVFlags Flags, unsigned NumInsts, FFlags FunFlags, uint64_t EntryCount, std::vector Refs, @@ -715,7 +809,8 @@ std::vector TypeCheckedLoadVCalls, std::vector TypeTestAssumeConstVCalls, std::vector TypeCheckedLoadConstVCalls, - std::vector Params) + std::vector Params, CallsitesTy CallsiteList, + AllocsTy AllocList) : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)), InstCount(NumInsts), FunFlags(FunFlags), EntryCount(EntryCount), CallGraphEdgeList(std::move(CGEdges)) { @@ -729,6 +824,10 @@ std::move(TypeCheckedLoadConstVCalls)}); if (!Params.empty()) ParamAccesses = std::make_unique(std::move(Params)); + if (!CallsiteList.empty()) + Callsites = std::make_unique(std::move(CallsiteList)); + if (!AllocList.empty()) + Allocs = std::make_unique(std::move(AllocList)); } // Gets the number of readonly and writeonly refs in RefEdgeList std::pair specialRefCounts() const; @@ -832,6 +931,18 @@ const TypeIdInfo *getTypeIdInfo() const { return TIdInfo.get(); }; + ArrayRef callsites() const { + if (Callsites) + return *Callsites; + return {}; + } + + ArrayRef allocs() const { + if (Allocs) + return *Allocs; + return {}; + } + friend struct GraphTraits; }; @@ -1163,6 +1274,16 @@ // the total number of basic blocks in the LTO unit in the combined index. uint64_t BlockCount; + // List of unique stack ids (hashes). We use a 4B index of the id in the + // stack id lists on the alloc and callsite summaries for memory savings, + // since the number of unique ids is in practice much smaller than the + // number of stack id references in the summaries. + std::vector StackIds; + + // Temporary map while building StackIds list. Clear when index is completely + // built via releaseTemporaryMemory. + std::map StackIdToIndex; + // YAML I/O support. 
friend yaml::MappingTraits; @@ -1205,6 +1326,31 @@ const_gvsummary_iterator end() const { return GlobalValueMap.end(); } size_t size() const { return GlobalValueMap.size(); } + const std::vector &stackIds() const { return StackIds; } + + unsigned addOrGetStackIdIndex(uint64_t StackId) { + auto Inserted = StackIdToIndex.insert({StackId, StackIds.size()}); + if (Inserted.second) + StackIds.push_back(StackId); + return Inserted.first->second; + } + + uint64_t getStackIdAtIndex(unsigned Index) const { + assert(StackIds.size() > Index); + return StackIds[Index]; + } + + // Facility to release memory from data structures only needed during index + // construction (including while building combined index). Currently this only + // releases the temporary map used while constructing a correspondence between + // stack ids and their index in the StackIds vector. Mostly impactful when + // building a large combined index. + void releaseTemporaryMemory() { + assert(StackIdToIndex.size() == StackIds.size()); + StackIdToIndex.clear(); + StackIds.shrink_to_fit(); + } + /// Convenience function for doing a DFS on a ValueInfo. Marks the function in /// the FunctionHasParent map. 
static void discoverNodes(ValueInfo V, diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -234,7 +234,8 @@ std::move(FSum.TypeCheckedLoadVCalls), std::move(FSum.TypeTestAssumeConstVCalls), std::move(FSum.TypeCheckedLoadConstVCalls), - ArrayRef{})); + ArrayRef{}, ArrayRef{}, + ArrayRef{})); } } static void output(IO &io, GlobalValueSummaryMapTy &V) { diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -224,3 +224,21 @@ CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes)); return true; } + +template <> +CallStack::CallStackIterator::CallStackIterator( + const MDNode *N, bool End) + : N(N) { + if (!N) + return; + Iter = End ? N->op_end() : N->op_begin(); +} + +template <> +uint64_t +CallStack::CallStackIterator::operator*() { + assert(Iter != N->op_end()); + ConstantInt *StackIdCInt = mdconst::dyn_extract(*Iter); + assert(StackIdCInt); + return StackIdCInt->getZExtValue(); +} diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/IndirectCallPromotionAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/TypeMetadataUtils.h" @@ -56,6 +57,7 @@ #include using namespace llvm; +using namespace llvm::memprof; #define DEBUG_TYPE "module-summary-analysis" @@ -275,6 +277,9 @@ std::vector NonVolatileLoads; std::vector NonVolatileStores; + std::vector Callsites; + std::vector 
Allocs; + bool HasInlineAsmMaybeReferencingInternal = false; bool HasIndirBranchToBlockAddress = false; bool HasUnknownCall = false; @@ -417,6 +422,57 @@ CallGraphEdges[Index.getOrInsertValueInfo(Candidate.Value)] .updateHotness(getHotness(Candidate.Count, PSI)); } + + // TODO: Skip indirect calls for now. Need to handle these better, likely + // by creating multiple Callsites, one per target, then speculatively + // devirtualize while applying clone info in the ThinLTO backends. This + // will also be important because we will have a different set of clone + // versions per target. This handling needs to match that in the ThinLTO + // backend so we handle things consistently for matching of callsite + // summaries to instructions. + if (!CalledFunction) + continue; + + // Compute the list of stack ids first (so we can trim them from the stack + // ids on any MIBs). + CallStack InstCallsite( + I.getMetadata(LLVMContext::MD_callsite)); + auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); + if (MemProfMD) { + std::vector MIBs; + for (auto &MDOp : MemProfMD->operands()) { + auto *MIBMD = cast(MDOp); + MDNode *StackNode = getMIBStackNode(MIBMD); + assert(StackNode); + SmallVector StackIdIndices; + CallStack StackContext(StackNode); + // Collapse out any on the allocation call (inlining). + for (auto ContextIter = + StackContext.beginAfterSharedPrefix(InstCallsite); + ContextIter != StackContext.end(); ++ContextIter) { + unsigned StackIdIdx = Index.addOrGetStackIdIndex(*ContextIter); + // If this is a direct recursion, simply skip the duplicate + // entries. If this is mutual recursion, handling is left to + // the LTO link analysis client. 
+ if (StackIdIndices.empty() || StackIdIndices.back() != StackIdIdx) + StackIdIndices.push_back(StackIdIdx); + } + MIBs.push_back( + MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices))); + } + Allocs.push_back(AllocInfo(std::move(MIBs))); + } else if (!InstCallsite.empty()) { + SmallVector StackIdIndices; + for (auto StackId : InstCallsite) + StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId)); + // Use the original CalledValue, in case it was an alias. We want + // to record the call edge to the alias in that case. Eventually + // an alias summary will be created to associate the alias and + // aliasee. + auto CalleeValueInfo = + Index.getOrInsertValueInfo(cast(CalledValue)); + Callsites.push_back({CalleeValueInfo, StackIdIndices}); + } } } Index.addBlockCount(F.size()); @@ -509,7 +565,8 @@ CallGraphEdges.takeVector(), TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(), TypeTestAssumeConstVCalls.takeVector(), - TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses)); + TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses), + std::move(Callsites), std::move(Allocs)); if (NonRenamableLocal) CantBePromoted.insert(F.getGUID()); Index.addGlobalValueSummary(F, std::move(FuncSummary)); @@ -758,7 +815,8 @@ ArrayRef{}, ArrayRef{}, ArrayRef{}, - ArrayRef{}); + ArrayRef{}, + ArrayRef{}, ArrayRef{}); Index.addGlobalValueSummary(*GV, std::move(Summary)); } else { std::unique_ptr Summary = diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -763,6 +763,14 @@ KEYWORD(byte); KEYWORD(bit); KEYWORD(varFlags); + KEYWORD(callsites); + KEYWORD(clones); + KEYWORD(stackIds); + KEYWORD(allocs); + KEYWORD(versions); + KEYWORD(memProf); + KEYWORD(notcold); + KEYWORD(notcoldandcold); #undef KEYWORD diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp --- 
a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -8534,6 +8534,8 @@ FunctionSummary::TypeIdInfo TypeIdInfo; std::vector ParamAccesses; std::vector Refs; + std::vector Callsites; + std::vector Allocs; // Default is all-zeros (conservative values). FunctionSummary::FFlags FFlags = {}; if (parseToken(lltok::colon, "expected ':' here") || @@ -8568,6 +8570,14 @@ if (parseOptionalParamAccesses(ParamAccesses)) return true; break; + case lltok::kw_allocs: + if (parseOptionalAllocs(Allocs)) + return true; + break; + case lltok::kw_callsites: + if (parseOptionalCallsites(Callsites)) + return true; + break; default: return error(Lex.getLoc(), "expected optional function summary field"); } @@ -8583,7 +8593,7 @@ std::move(TypeIdInfo.TypeCheckedLoadVCalls), std::move(TypeIdInfo.TypeTestAssumeConstVCalls), std::move(TypeIdInfo.TypeCheckedLoadConstVCalls), - std::move(ParamAccesses)); + std::move(ParamAccesses), std::move(Callsites), std::move(Allocs)); FS->setModulePath(ModulePath); @@ -9535,3 +9545,220 @@ VI.setWriteOnly(); return false; } + +/// OptionalAllocs +/// := 'allocs' ':' '(' Alloc [',' Alloc]* ')' +/// Alloc ::= '(' 'versions' ':' '(' Version [',' Version]* ')' +/// ',' MemProfs ')' +/// Version ::= UInt32 +bool LLParser::parseOptionalAllocs(std::vector &Allocs) { + assert(Lex.getKind() == lltok::kw_allocs); + Lex.Lex(); + + if (parseToken(lltok::colon, "expected ':' in allocs") || + parseToken(lltok::lparen, "expected '(' in allocs")) + return true; + + // parse each alloc + do { + if (parseToken(lltok::lparen, "expected '(' in alloc") || + parseToken(lltok::kw_versions, "expected 'versions' in alloc") || + parseToken(lltok::colon, "expected ':'") || + parseToken(lltok::lparen, "expected '(' in versions")) + return true; + + SmallVector Versions; + do { + uint8_t V = 0; + if (parseAllocType(V)) + return true; + Versions.push_back(V); + } while (EatIfPresent(lltok::comma)); + + if (parseToken(lltok::rparen, "expected ')' in versions") || + 
parseToken(lltok::comma, "expected ',' in alloc")) + return true; + + std::vector MIBs; + if (parseMemProfs(MIBs)) + return true; + + Allocs.push_back({Versions, MIBs}); + + if (parseToken(lltok::rparen, "expected ')' in alloc")) + return true; + } while (EatIfPresent(lltok::comma)); + + if (parseToken(lltok::rparen, "expected ')' in allocs")) + return true; + + return false; +} + +/// MemProfs +/// := 'memProf' ':' '(' MemProf [',' MemProf]* ')' +/// MemProf ::= '(' 'type' ':' AllocType +/// ',' 'stackIds' ':' '(' StackId [',' StackId]* ')' ')' +/// StackId ::= UInt64 +bool LLParser::parseMemProfs(std::vector &MIBs) { + assert(Lex.getKind() == lltok::kw_memProf); + Lex.Lex(); + + if (parseToken(lltok::colon, "expected ':' in memprof") || + parseToken(lltok::lparen, "expected '(' in memprof")) + return true; + + // parse each MIB + do { + if (parseToken(lltok::lparen, "expected '(' in memprof") || + parseToken(lltok::kw_type, "expected 'type' in memprof") || + parseToken(lltok::colon, "expected ':'")) + return true; + + uint8_t AllocType; + if (parseAllocType(AllocType)) + return true; + + if (parseToken(lltok::comma, "expected ',' in memprof") || + parseToken(lltok::kw_stackIds, "expected 'stackIds' in memprof") || + parseToken(lltok::colon, "expected ':'") || + parseToken(lltok::lparen, "expected '(' in stackIds")) + return true; + + SmallVector StackIdIndices; + do { + uint64_t StackId = 0; + if (parseUInt64(StackId)) + return true; + StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId)); + } while (EatIfPresent(lltok::comma)); + + if (parseToken(lltok::rparen, "expected ')' in stackIds")) + return true; + + MIBs.push_back({(AllocationType)AllocType, StackIdIndices}); + + if (parseToken(lltok::rparen, "expected ')' in memprof")) + return true; + } while (EatIfPresent(lltok::comma)); + + if (parseToken(lltok::rparen, "expected ')' in memprof")) + return true; + + return false; +} + +/// AllocType +/// := ('none'|'notcold'|'cold'|'notcoldandcold') +bool 
LLParser::parseAllocType(uint8_t &AllocType) { + switch (Lex.getKind()) { + case lltok::kw_none: + AllocType = (uint8_t)AllocationType::None; + break; + case lltok::kw_notcold: + AllocType = (uint8_t)AllocationType::NotCold; + break; + case lltok::kw_cold: + AllocType = (uint8_t)AllocationType::Cold; + break; + case lltok::kw_notcoldandcold: + AllocType = + (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold; + break; + default: + return error(Lex.getLoc(), "invalid alloc type"); + } + Lex.Lex(); + return false; +} + +/// OptionalCallsites +/// := 'callsites' ':' '(' Callsite [',' Callsite]* ')' +/// Callsite ::= '(' 'callee' ':' GVReference +/// ',' 'clones' ':' '(' Version [',' Version]* ')' +/// ',' 'stackIds' ':' '(' StackId [',' StackId]* ')' ')' +/// Version ::= UInt32 +/// StackId ::= UInt64 +bool LLParser::parseOptionalCallsites(std::vector &Callsites) { + assert(Lex.getKind() == lltok::kw_callsites); + Lex.Lex(); + + if (parseToken(lltok::colon, "expected ':' in callsites") || + parseToken(lltok::lparen, "expected '(' in callsites")) + return true; + + IdToIndexMapType IdToIndexMap; + // parse each callsite + do { + if (parseToken(lltok::lparen, "expected '(' in callsite") || + parseToken(lltok::kw_callee, "expected 'callee' in callsite") || + parseToken(lltok::colon, "expected ':'")) + return true; + + ValueInfo VI; + unsigned GVId = 0; + LocTy Loc = Lex.getLoc(); + if (!EatIfPresent(lltok::kw_null)) { + if (parseGVReference(VI, GVId)) + return true; + } + + if (parseToken(lltok::comma, "expected ',' in callsite") || + parseToken(lltok::kw_clones, "expected 'clones' in callsite") || + parseToken(lltok::colon, "expected ':'") || + parseToken(lltok::lparen, "expected '(' in clones")) + return true; + + SmallVector Clones; + do { + unsigned V = 0; + if (parseUInt32(V)) + return true; + Clones.push_back(V); + } while (EatIfPresent(lltok::comma)); + + if (parseToken(lltok::rparen, "expected ')' in clones") || + parseToken(lltok::comma, "expected 
',' in callsite") || + parseToken(lltok::kw_stackIds, "expected 'stackIds' in callsite") || + parseToken(lltok::colon, "expected ':'") || + parseToken(lltok::lparen, "expected '(' in stackIds")) + return true; + + SmallVector StackIdIndices; + do { + uint64_t StackId = 0; + if (parseUInt64(StackId)) + return true; + StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId)); + } while (EatIfPresent(lltok::comma)); + + if (parseToken(lltok::rparen, "expected ')' in stackIds")) + return true; + + // Keep track of the Callsites array index needing a forward reference. + // We will save the location of the ValueInfo needing an update, but + // can only do so once the SmallVector is finalized. + if (VI.getRef() == FwdVIRef) + IdToIndexMap[GVId].push_back(std::make_pair(Callsites.size(), Loc)); + Callsites.push_back({VI, Clones, StackIdIndices}); + + if (parseToken(lltok::rparen, "expected ')' in callsite")) + return true; + } while (EatIfPresent(lltok::comma)); + + // Now that the Callsites vector is finalized, it is safe to save the + // locations of any forward GV references that need updating later. 
+ for (auto I : IdToIndexMap) { + auto &Infos = ForwardRefValueInfos[I.first]; + for (auto P : I.second) { + assert(Callsites[P.first].Callee.getRef() == FwdVIRef && + "Forward referenced ValueInfo expected to be empty"); + Infos.emplace_back(&Callsites[P.first].Callee, P.second); + } + } + + if (parseToken(lltok::rparen, "expected ')' in callsites")) + return true; + + return false; +} diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp --- a/llvm/lib/AsmParser/Parser.cpp +++ b/llvm/lib/AsmParser/Parser.cpp @@ -177,6 +177,12 @@ return parseSummaryIndexAssembly(FileOrErr.get()->getMemBufferRef(), Err); } +std::unique_ptr +llvm::parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err) { + MemoryBufferRef F(AsmString, ""); + return parseSummaryIndexAssembly(F, Err); +} + Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err, const Module &M, const SlotMapping *Slots) { SourceMgr SM; diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -315,6 +315,11 @@ STRINGIFY_CODE(FS, TYPE_ID_METADATA) STRINGIFY_CODE(FS, BLOCK_COUNT) STRINGIFY_CODE(FS, PARAM_ACCESS) + STRINGIFY_CODE(FS, PERMODULE_CALLSITE_INFO) + STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO) + STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO) + STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO) + STRINGIFY_CODE(FS, STACK_IDS) } case bitc::METADATA_ATTACHMENT_ID: switch (CodeID) { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -884,8 +884,10 @@ // they are recorded in the summary index being built. // We save a GUID which refers to the same global as the ValueInfo, but // ignoring the linkage, i.e. for values other than local linkage they are - // identical. 
- DenseMap> + // identical (this is the second tuple member). + // The third tuple member is the real GUID of the ValueInfo. + DenseMap> ValueIdToValueInfoMap; /// Map populated during module path string table parsing, from the @@ -905,10 +907,19 @@ /// this module by the client. unsigned ModuleId; + /// Callback to ask whether a symbol is the prevailing copy when invoked + /// during combined index building. + std::function IsPrevailing; + + /// Saves the stack ids from the STACK_IDS record to consult when adding stack + /// ids from the lists in the callsite and alloc entries to the index. + std::vector StackIds; + public: - ModuleSummaryIndexBitcodeReader(BitstreamCursor Stream, StringRef Strtab, - ModuleSummaryIndex &TheIndex, - StringRef ModulePath, unsigned ModuleId); + ModuleSummaryIndexBitcodeReader( + BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex, + StringRef ModulePath, unsigned ModuleId, + std::function IsPrevailing = nullptr); Error parseModule(); @@ -932,8 +943,12 @@ std::vector parseParamAccesses(ArrayRef Record); - std::tuple + std::tuple +#ifndef NDEBUG + getValueInfoFromValueId(unsigned ValueId, bool AllowNullValueInfo = false); +#else getValueInfoFromValueId(unsigned ValueId); +#endif void addThisModule(); ModuleSummaryIndex::ModuleInfo *getThisModule(); @@ -6593,9 +6608,10 @@ ModuleSummaryIndexBitcodeReader::ModuleSummaryIndexBitcodeReader( BitstreamCursor Cursor, StringRef Strtab, ModuleSummaryIndex &TheIndex, - StringRef ModulePath, unsigned ModuleId) + StringRef ModulePath, unsigned ModuleId, + std::function IsPrevailing) : BitcodeReaderBase(std::move(Cursor), Strtab), TheIndex(TheIndex), - ModulePath(ModulePath), ModuleId(ModuleId) {} + ModulePath(ModulePath), ModuleId(ModuleId), IsPrevailing(IsPrevailing) {} void ModuleSummaryIndexBitcodeReader::addThisModule() { TheIndex.addModule(ModulePath, ModuleId); @@ -6606,10 +6622,19 @@ return TheIndex.getModule(ModulePath); } -std::tuple +std::tuple +#ifndef NDEBUG 
+ModuleSummaryIndexBitcodeReader::getValueInfoFromValueId( + unsigned ValueId, bool AllowNullValueInfo) { +#else ModuleSummaryIndexBitcodeReader::getValueInfoFromValueId(unsigned ValueId) { +#endif auto VGI = ValueIdToValueInfoMap[ValueId]; - assert(std::get<0>(VGI)); + // This can happen in stack nodes in index files for distributed ThinLTO if + // the callee function summary is not included in the index. The bitcode + // writer records 0. Better way to deal with this so we don't have to disable + // this assert for all clients of this method? + assert(AllowNullValueInfo || std::get<0>(VGI)); return VGI; } @@ -6632,7 +6657,7 @@ ValueIdToValueInfoMap[ValueID] = std::make_tuple( TheIndex.getOrInsertValueInfo( ValueGUID, UseStrtab ? ValueName : TheIndex.saveString(ValueName)), - OriginalNameID); + OriginalNameID, ValueGUID); } // Specialized value symbol table parser used when reading module index @@ -6720,8 +6745,8 @@ GlobalValue::GUID RefGUID = Record[1]; // The "original name", which is the second value of the pair will be // overriden later by a FS_COMBINED_ORIGINAL_NAME in the combined index. 
- ValueIdToValueInfoMap[ValueID] = - std::make_tuple(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID); + ValueIdToValueInfoMap[ValueID] = std::make_tuple( + TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID); break; } } @@ -7066,6 +7091,9 @@ PendingTypeCheckedLoadConstVCalls; std::vector PendingParamAccesses; + std::vector PendingCallsites; + std::vector PendingAllocs; + while (true) { Expected MaybeEntry = Stream.advanceSkippingSubblocks(); if (!MaybeEntry) @@ -7104,8 +7132,8 @@ case bitc::FS_VALUE_GUID: { // [valueid, refguid] uint64_t ValueID = Record[0]; GlobalValue::GUID RefGUID = Record[1]; - ValueIdToValueInfoMap[ValueID] = - std::make_tuple(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID); + ValueIdToValueInfoMap[ValueID] = std::make_tuple( + TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID); break; } // FS_PERMODULE: [valueid, flags, instcount, fflags, numrefs, @@ -7157,6 +7185,13 @@ ArrayRef(Record).slice(CallGraphEdgeStartIndex), IsOldProfileFormat, HasProfile, HasRelBF); setSpecialRefs(Refs, NumRORefs, NumWORefs); + auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID); + // In order to save memory, only record the memprof summaries if this is + // the prevailing copy of a symbol. 
+ if (IsPrevailing && !IsPrevailing(std::get<2>(VIAndOriginalGUID))) { + PendingCallsites.clear(); + PendingAllocs.clear(); + } auto FS = std::make_unique( Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0, std::move(Refs), std::move(Calls), std::move(PendingTypeTests), @@ -7164,8 +7199,8 @@ std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), std::move(PendingTypeCheckedLoadConstVCalls), - std::move(PendingParamAccesses)); - auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID); + std::move(PendingParamAccesses), std::move(PendingCallsites), + std::move(PendingAllocs)); FS->setModulePath(getThisModule()->first()); FS->setOriginalName(std::get<1>(VIAndOriginalGUID)); TheIndex.addGlobalValueSummary(std::get<0>(VIAndOriginalGUID), @@ -7308,7 +7343,8 @@ std::move(PendingTypeCheckedLoadVCalls), std::move(PendingTypeTestAssumeConstVCalls), std::move(PendingTypeCheckedLoadConstVCalls), - std::move(PendingParamAccesses)); + std::move(PendingParamAccesses), std::move(PendingCallsites), + std::move(PendingAllocs)); LastSeenSummary = FS.get(); LastSeenGUID = VI.getGUID(); FS->setModulePath(ModuleIdMap[ModuleId]); @@ -7434,6 +7470,96 @@ PendingParamAccesses = parseParamAccesses(Record); break; } + + case bitc::FS_STACK_IDS: { // [n x stackid] + // Save stack ids in the reader to consult when adding stack ids from the + // lists in the stack node and alloc node entries. 
+ StackIds = ArrayRef(Record); + break; + } + + case bitc::FS_PERMODULE_CALLSITE_INFO: { + unsigned ValueID = Record[0]; + SmallVector StackIdList; + for (auto R = Record.begin() + 1; R != Record.end(); R++) { + assert(*R < StackIds.size()); + StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[*R])); + } + ValueInfo VI = std::get<0>(getValueInfoFromValueId(ValueID)); + PendingCallsites.push_back(CallsiteInfo({VI, std::move(StackIdList)})); + break; + } + + case bitc::FS_COMBINED_CALLSITE_INFO: { + unsigned ValueID = Record[0]; + unsigned I = 1; + SmallVector StackIdList; + SmallVector Versions; + unsigned NumStackIds = Record[I++]; + unsigned NumVersions = Record[I++]; + assert(Record.size() - I >= NumStackIds); + for (unsigned J = 0; J < NumStackIds; J++) { + assert(Record[I] < StackIds.size()); + StackIdList.push_back( + TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]])); + } + assert(Record.size() - I >= NumVersions); + for (unsigned J = 0; J < NumVersions; J++) + Versions.push_back(Record[I++]); + ValueInfo VI = std::get<0>( + getValueInfoFromValueId(ValueID, /*AllowNullValueInfo=*/true)); + PendingCallsites.push_back( + CallsiteInfo({VI, std::move(Versions), std::move(StackIdList)})); + break; + } + + case bitc::FS_PERMODULE_ALLOC_INFO: { + unsigned I = 0; + std::vector MIBs; + while (I < Record.size()) { + assert(Record.size() - I >= 2); + AllocationType AllocType = (AllocationType)Record[I++]; + unsigned NumStackEntries = Record[I++]; + assert(Record.size() - I >= NumStackEntries); + SmallVector StackIdList; + for (unsigned J = 0; J < NumStackEntries; J++) { + assert(Record[I] < StackIds.size()); + StackIdList.push_back( + TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]])); + } + MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList))); + } + PendingAllocs.push_back(AllocInfo(std::move(MIBs))); + break; + } + + case bitc::FS_COMBINED_ALLOC_INFO: { + unsigned I = 0; + std::vector MIBs; + unsigned NumMIBs = Record[I++]; + unsigned NumVersions 
= Record[I++]; + unsigned MIBsRead = 0; + while (MIBsRead++ < NumMIBs) { + assert(Record.size() - I >= 2); + AllocationType AllocType = (AllocationType)Record[I++]; + unsigned NumStackEntries = Record[I++]; + assert(Record.size() - I >= NumStackEntries); + SmallVector StackIdList; + for (unsigned J = 0; J < NumStackEntries; J++) { + assert(Record[I] < StackIds.size()); + StackIdList.push_back( + TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]])); + } + MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList))); + } + assert(Record.size() - I >= NumVersions); + SmallVector Versions; + for (unsigned J = 0; J < NumVersions; J++) + Versions.push_back(Record[I++]); + PendingAllocs.push_back( + AllocInfo(std::move(Versions), std::move(MIBs))); + break; + } } } llvm_unreachable("Exit infinite loop"); @@ -7753,14 +7879,15 @@ // We don't use ModuleIdentifier here because the client may need to control the // module path used in the combined summary (e.g. when reading summaries for // regular LTO modules). 
-Error BitcodeModule::readSummary(ModuleSummaryIndex &CombinedIndex, - StringRef ModulePath, uint64_t ModuleId) { +Error BitcodeModule::readSummary( + ModuleSummaryIndex &CombinedIndex, StringRef ModulePath, uint64_t ModuleId, + std::function IsPrevailing) { BitstreamCursor Stream(Buffer); if (Error JumpFailed = Stream.JumpToBit(ModuleBit)) return JumpFailed; ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, CombinedIndex, - ModulePath, ModuleId); + ModulePath, ModuleId, IsPrevailing); return R.parseModule(); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -211,12 +211,10 @@ void writePerModuleGlobalValueSummary(); private: - void writePerModuleFunctionSummaryRecord(SmallVector &NameVals, - GlobalValueSummary *Summary, - unsigned ValueID, - unsigned FSCallsAbbrev, - unsigned FSCallsProfileAbbrev, - const Function &F); + void writePerModuleFunctionSummaryRecord( + SmallVector &NameVals, GlobalValueSummary *Summary, + unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev, + unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F); void writeModuleLevelReferences(const GlobalVariable &V, SmallVector &NameVals, unsigned FSModRefsAbbrev, @@ -422,6 +420,11 @@ /// index and a value id generated by this class to use in references. std::map GUIDToValueIdMap; + // The sorted stack id indices actually used in the summary entries being + // written, which will be a subset of those in the full index in the case of + // distributed indexes. + std::vector StackIdIndices; + /// Tracks the last value id recorded in the GUIDToValueMap. unsigned GlobalValueId = 0; @@ -439,9 +442,28 @@ // in writing out the call graph edges. Save the mapping from GUID // to the new global value id to use when writing those edges, which // are currently saved in the index in terms of GUID. 
- forEachSummary([&](GVInfo I, bool) { + forEachSummary([&](GVInfo I, bool IsAliasee) { GUIDToValueIdMap[I.first] = ++GlobalValueId; + if (IsAliasee) + return; + auto *FS = dyn_cast(I.second); + if (!FS) + return; + // Record all stack id indices actually used in the summary entries being + // written, so that we can compact them in the case of distributed ThinLTO + // indexes. + for (auto &CI : FS->callsites()) + for (auto Idx : CI.StackIdIndices) + StackIdIndices.push_back(Idx); + for (auto &AI : FS->allocs()) + for (auto &MIB : AI.MIBs) + for (auto Idx : MIB.StackIdIndices) + StackIdIndices.push_back(Idx); }); + llvm::sort(StackIdIndices); + StackIdIndices.erase( + std::unique(StackIdIndices.begin(), StackIdIndices.end()), + StackIdIndices.end()); } /// The below iterator returns the GUID and associated summary. @@ -3881,11 +3903,64 @@ } } +static void writeFunctionHeapProfileRecords( + BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev, + unsigned AllocAbbrev, bool PerModule, + std::function GetValueID, + std::function GetStackIndex) { + SmallVector Record; + + for (auto &CI : FS->callsites()) { + Record.clear(); + // Per module callsite clones should always have a single entry of + // value 0. + assert(!PerModule || (CI.Clones.size() == 1 && CI.Clones[0] == 0)); + Record.push_back(GetValueID(CI.Callee)); + if (!PerModule) { + Record.push_back(CI.StackIdIndices.size()); + Record.push_back(CI.Clones.size()); + } + for (auto Id : CI.StackIdIndices) + Record.push_back(GetStackIndex(Id)); + if (!PerModule) { + for (auto V : CI.Clones) + Record.push_back(V); + } + Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_CALLSITE_INFO + : bitc::FS_COMBINED_CALLSITE_INFO, + Record, CallsiteAbbrev); + } + + for (auto &AI : FS->allocs()) { + Record.clear(); + // Per module alloc versions should always have a single entry of + // value 0. 
+ assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0)); + if (!PerModule) { + Record.push_back(AI.MIBs.size()); + Record.push_back(AI.Versions.size()); + } + for (auto &MIB : AI.MIBs) { + Record.push_back((uint8_t)MIB.AllocType); + Record.push_back(MIB.StackIdIndices.size()); + for (auto Id : MIB.StackIdIndices) + Record.push_back(GetStackIndex(Id)); + } + if (!PerModule) { + for (auto V : AI.Versions) + Record.push_back(V); + } + Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO + : bitc::FS_COMBINED_ALLOC_INFO, + Record, AllocAbbrev); + } +} + // Helper to emit a single function summary record. void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev, - const Function &F) { + unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F) { NameVals.push_back(ValueID); FunctionSummary *FS = cast(Summary); @@ -3895,6 +3970,12 @@ return {VE.getValueID(VI.getValue())}; }); + writeFunctionHeapProfileRecords( + Stream, FS, CallsiteAbbrev, AllocAbbrev, + /*PerModule*/ true, + /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); }, + /*GetStackIndex*/ [&](unsigned I) { return I; }); + auto SpecialRefCnts = FS->specialRefCounts(); NameVals.push_back(getEncodedGVSummaryFlags(FS->flags())); NameVals.push_back(FS->instCount()); @@ -4006,6 +4087,16 @@ ArrayRef{GVI.second, GVI.first}); } + if (!Index->stackIds().empty()) { + auto StackIdAbbv = std::make_shared(); + StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS)); + // numids x stackid + StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv)); + Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId); + } + // Abbrev for FS_PERMODULE_PROFILE. 
auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE)); @@ -4077,6 +4168,21 @@ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned TypeIdCompatibleVtableAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_CALLSITE_INFO)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + // n x stackidindex + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO)); + // n x (alloc type, numstackids, numstackids x stackidindex) + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + SmallVector NameVals; // Iterate over the list of functions instead of the Index to // ensure the ordering is stable. @@ -4095,7 +4201,8 @@ } auto *Summary = VI.getSummaryList()[0].get(); writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F), - FSCallsAbbrev, FSCallsProfileAbbrev, F); + FSCallsAbbrev, FSCallsProfileAbbrev, + CallsiteAbbrev, AllocAbbrev, F); } // Capture references from GlobalVariable initializers, which are outside @@ -4137,7 +4244,7 @@ /// Emit the combined summary section into the combined index file. 
void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { - Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3); + Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 4); Stream.EmitRecord( bitc::FS_VERSION, ArrayRef{ModuleSummaryIndex::BitcodeSummaryVersion}); @@ -4150,6 +4257,21 @@ ArrayRef{GVI.second, GVI.first}); } + if (!StackIdIndices.empty()) { + auto StackIdAbbv = std::make_shared(); + StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS)); + // numids x stackid + StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv)); + // Write the stack ids used by this index, which will be a subset of those in + // the full index in the case of distributed indexes. + std::vector StackIds; + for (auto &I : StackIdIndices) + StackIds.push_back(Index.getStackIdAtIndex(I)); + Stream.EmitRecord(bitc::FS_STACK_IDS, StackIds, StackIdAbbvId); + } + // Abbrev for FS_COMBINED. 
auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED)); @@ -4203,6 +4325,26 @@ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_CALLSITE_INFO)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numstackindices + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver + // numstackindices x stackidindex, numver x version + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver + // nummib x (alloc type, numstackids, numstackids x stackidindex), + // numver x version + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + // The aliases are emitted as a post-pass, and will point to the value // id of the aliasee. Save them in a vector for post-processing. SmallVector Aliases; @@ -4279,6 +4421,8 @@ } auto GetValueId = [&](const ValueInfo &VI) -> Optional { + if (!VI) + return None; return getValueId(VI.getGUID()); }; @@ -4286,6 +4430,27 @@ writeFunctionTypeMetadataRecords(Stream, FS, GetValueId); getReferencedTypeIds(FS, ReferencedTypeIds); + writeFunctionHeapProfileRecords( + Stream, FS, CallsiteAbbrev, AllocAbbrev, + /*PerModule*/ false, + /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned { + Optional ValueID = GetValueId(VI); + // This can happen in shared index files for distributed ThinLTO if + // the callee function summary is not included. 
Record 0 which we + // will have to deal with conservatively when doing any kind of + // validation in the ThinLTO backends. + if (!ValueID) + return 0; + return *ValueID; + }, + /*GetStackIndex*/ [&](unsigned I) { + // Get the corresponding index into the list of StackIdIndices + // actually being written for this combined index (which may be a + // subset in the case of distributed indexes). + auto Lower = llvm::lower_bound(StackIdIndices, I); + return std::distance(StackIdIndices.begin(), Lower); + }); + NameVals.push_back(*ValueId); NameVals.push_back(Index.getModuleId(FS->modulePath())); NameVals.push_back(getEncodedGVSummaryFlags(FS->flags())); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3186,6 +3186,85 @@ if (const auto *TIdInfo = FS->getTypeIdInfo()) printTypeIdInfo(*TIdInfo); + // The AllocationType identifiers capture the profiled context behavior + // reaching a specific static allocation site (possibly cloned). Thus + // "notcoldandcold" implies there are multiple contexts which reach this site, + // some of which are cold and some of which are not, and that need to + // disambiguate via cloning or other context identification. 
+ auto AllocTypeName = [](uint8_t Type) { + switch (Type) { + case (uint8_t)AllocationType::None: + return "none"; + break; + case (uint8_t)AllocationType::NotCold: + return "notcold"; + break; + case (uint8_t)AllocationType::Cold: + return "cold"; + break; + case (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold: + return "notcoldandcold"; + break; + default: + assert(false && "Unexpected alloc type"); + } + }; + + if (!FS->allocs().empty()) { + Out << ", allocs: ("; + FieldSeparator AFS; + for (auto &AI : FS->allocs()) { + Out << AFS; + Out << "(versions: ("; + FieldSeparator VFS; + for (auto V : AI.Versions) { + Out << VFS; + Out << AllocTypeName(V); + } + Out << "), memProf: ("; + FieldSeparator MIBFS; + for (auto &MIB : AI.MIBs) { + Out << MIBFS; + Out << "(type: " << AllocTypeName((uint8_t)MIB.AllocType); + Out << ", stackIds: ("; + FieldSeparator SIDFS; + for (auto Id : MIB.StackIdIndices) { + Out << SIDFS; + Out << TheIndex->getStackIdAtIndex(Id); + } + Out << "))"; + } + Out << "))"; + } + Out << ")"; + } + + if (!FS->callsites().empty()) { + Out << ", callsites: ("; + FieldSeparator SNFS; + for (auto &CI : FS->callsites()) { + Out << SNFS; + if (CI.Callee) + Out << "(callee: ^" << Machine.getGUIDSlot(CI.Callee.getGUID()); + else + Out << "(callee: null"; + Out << ", clones: ("; + FieldSeparator VFS; + for (auto V : CI.Clones) { + Out << VFS; + Out << V; + } + Out << "), stackIds: ("; + FieldSeparator SIDFS; + for (auto Id : CI.StackIdIndices) { + Out << SIDFS; + Out << TheIndex->getStackIdAtIndex(Id); + } + Out << "))"; + } + Out << ")"; + } + auto PrintRange = [&](const ConstantRange &Range) { Out << "[" << Range.getSignedMin() << ", " << Range.getSignedMax() << "]"; }; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -911,9 +911,25 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, const SymbolResolution *&ResI, const SymbolResolution *ResE) { + const SymbolResolution 
*ResITmp = ResI; + for (const InputFile::Symbol &Sym : Syms) { + assert(ResITmp != ResE); + SymbolResolution Res = *ResITmp++; + + if (!Sym.getIRName().empty()) { + auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( + Sym.getIRName(), GlobalValue::ExternalLinkage, "")); + if (Res.Prevailing) + ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); + } + } + if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(), - ThinLTO.ModuleMap.size())) + ThinLTO.ModuleMap.size(), [&](GlobalValue::GUID GUID) { + return ThinLTO.PrevailingModuleForGUID[GUID] == + BM.getModuleIdentifier(); + })) return Err; for (const InputFile::Symbol &Sym : Syms) { @@ -924,7 +940,8 @@ auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (Res.Prevailing) { - ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); + assert(ThinLTO.PrevailingModuleForGUID[GUID] == + BM.getModuleIdentifier()); // For linker redefined symbols (via --wrap or --defsym) we want to // switch the linkage to `weak` to prevent IPOs from happening. @@ -1454,6 +1471,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, const DenseSet &GUIDPreservedSymbols) { + ThinLTO.CombinedIndex.releaseTemporaryMemory(); timeTraceProfilerBegin("ThinLink", StringRef("")); auto TimeTraceScopeExit = llvm::make_scope_exit([]() { if (llvm::timeTraceProfilerEnabled()) diff --git a/llvm/test/Assembler/thinlto-memprof-summary.ll b/llvm/test/Assembler/thinlto-memprof-summary.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Assembler/thinlto-memprof-summary.ll @@ -0,0 +1,24 @@ +;; Test memprof summary parsing (tests all types/fields in various combinations). 
+; RUN: llvm-as %s -o - | llvm-dis -o - | FileCheck %s + +; ModuleID = 'thinlto-memprof-summary.thinlto.bc' + +^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049)) +;; Function with single alloc, multiple memprof MIBs, no versioning +^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: notcoldandcold, stackIds: (23456789)))))))) +;; Function with callsite stack ids calling above function, no versioning +^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789)))))) +;; Function with multiple allocs, multiple memprof MIBs, multiple versions +^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), 
(type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789)))))))) +;; Function with callsite stack ids calling above function, multiple versions +^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789)))))) +;; Function with null callsite stack id (can happen in distributed indexes if callsite not imported) +^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414)))))) + +; Make sure we get back from llvm-dis what we put in via llvm-as. 
+; CHECK: ^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049)) +; CHECK: ^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: notcoldandcold, stackIds: (23456789)))))))) +; CHECK: ^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789)))))) +; CHECK: ^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789)))))))) +; CHECK: ^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 
0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789)))))) +; CHECK: ^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414)))))) diff --git a/llvm/test/ThinLTO/X86/memprof-summary.ll b/llvm/test/ThinLTO/X86/memprof-summary.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-summary.ll @@ -0,0 +1,185 @@ +;; Check memprof summaries (per module, combined index, and distributed indexes) + +; RUN: split-file %s %t +; RUN: opt -module-summary %t/a.ll -o %ta.bc +; RUN: opt -module-summary %t/b.ll -o %tb.bc + +; RUN: llvm-dis -o - %ta.bc | FileCheck %s --check-prefix=PRELINKDISA +; PRELINKDISA: gv: (name: "main", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) ; guid = 15822663052811949562 + +; RUN: llvm-dis -o - %tb.bc | FileCheck %s --check-prefix=PRELINKDISB +; PRELINKDISB: gv: (name: "_Z3foov", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (2732490490862098848)))))) ; guid = 9191153033785521275 +; PRELINKDISB: gv: (name: "_Z3bazv", {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000 +; PRELINKDISB: gv: (name: "_Z3barv", {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 
8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 17377440600225628772 + +; RUN: llvm-bcanalyzer -dump %ta.bc | FileCheck %s --check-prefix=PRELINKBCANA +; PRELINKBCANA: + +; RUN: llvm-bcanalyzer -dump %tb.bc | FileCheck %s --check-prefix=PRELINKBCANB +; PRELINKBCANB: + +; RUN: llvm-lto2 run %ta.bc %tb.bc -o %t -save-temps \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%ta.bc,main,plx \ +; RUN: -r=%ta.bc,_Z3foov, \ +; RUN: -r=%ta.bc,free, \ +; RUN: -r=%ta.bc,sleep, \ +; RUN: -r=%tb.bc,_Z3foov,pl \ +; RUN: -r=%tb.bc,_Znam, \ +; RUN: -r=%tb.bc,_Z3barv,pl \ +; RUN: -r=%tb.bc,_Z3bazv,pl + +; RUN: llvm-dis -o - %t.index.bc | FileCheck %s --check-prefix=COMBINEDDIS +; COMBINEDDIS: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (2732490490862098848)))))) +; COMBINEDDIS: gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^5, clones: (0), stackIds: (12481870273128938184)))))) +; COMBINEDDIS: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) +; COMBINEDDIS: gv: (guid: 17377440600225628772, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) + +; RUN: llvm-bcanalyzer -dump %t.index.bc | FileCheck %s --check-prefix=COMBINEDBCAN +; COMBINEDBCAN: + +; RUN: llvm-dis -o - %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISA +; DISTRIBUTEDDISA: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: null, clones: (0), stackIds: (2732490490862098848)))))) +; DISTRIBUTEDDISA: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: 
(15025054523792398438)))))) + +; RUN: llvm-dis -o - %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISB +; DISTRIBUTEDDISB: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (2732490490862098848)))))) +; DISTRIBUTEDDISB: gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (12481870273128938184)))))) +; DISTRIBUTEDDISB: gv: (guid: 17377440600225628772, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) + +; RUN: llvm-bcanalyzer -dump %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANA +; DISTRIBUTEDBCANA: + +; RUN: llvm-bcanalyzer -dump %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANB +; DISTRIBUTEDBCANB: + +;--- a.ll +; ModuleID = 'a.cc' +source_filename = "a.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress norecurse uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !dbg !39 { +entry: + %call = call noundef ptr @_Z3foov(), !dbg !42, !callsite !43 + %call1 = call noundef ptr @_Z3foov(), !dbg !44, !callsite !45 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call, i8 0, i64 10, i1 false), !dbg !46 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call1, i8 0, i64 10, i1 false), !dbg !47 + call void @free(ptr noundef %call) #4, !dbg !48 + %call2 = call i32 @sleep(i32 noundef 10), !dbg !49 + call void @free(ptr noundef %call1) #4, !dbg !50 + ret i32 0, !dbg !51 +} + +declare !dbg !52 noundef ptr @_Z3foov() local_unnamed_addr #1 + +; Function Attrs: argmemonly mustprogress nocallback nofree nounwind willreturn writeonly 
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +; Function Attrs: inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") +declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #3 + +declare !dbg !53 i32 @sleep(i32 noundef) local_unnamed_addr #1 + +attributes #0 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { argmemonly mustprogress nocallback nofree nounwind willreturn writeonly } +attributes #3 = { inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") "alloc-family"="malloc" "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git@github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "a.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ebabd56909271a1d4a7cac81c10624d5") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE 
Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!39 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 6, column: 13, scope: !39) +!43 = !{i64 8632435727821051414} +!44 = !DILocation(line: 7, column: 13, scope: !39) +!45 = !{i64 -3421689549917153178} +!46 = !DILocation(line: 8, column: 3, scope: !39) +!47 = !DILocation(line: 9, column: 3, scope: !39) +!48 = !DILocation(line: 10, column: 3, scope: !39) +!49 = !DILocation(line: 11, column: 3, scope: !39) +!50 = !DILocation(line: 12, column: 3, scope: !39) +!51 = !DILocation(line: 13, column: 3, scope: !39) +!52 = !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41) +!53 = !DISubprogram(name: "sleep", scope: !54, file: !54, line: 453, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41) +!54 = !DIFile(filename: "include/unistd.h", directory: "/usr", checksumkind: CSK_MD5, checksum: "ee8f41a17f563f029d0e930ad871815a") + +;--- b.ll +; ModuleID = 'b.cc' +source_filename = "b.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline uwtable +define dso_local noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 !dbg !39 { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #2, !dbg !42, !memprof !43, !callsite !48 + ret ptr %call, !dbg !49 +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1 + +; Function Attrs: mustprogress noinline uwtable +define dso_local 
noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #0 !dbg !50 { +entry: + %call = call noundef ptr @_Z3barv(), !dbg !51, !callsite !52 + ret ptr %call, !dbg !53 +} + +; Function Attrs: mustprogress uwtable +define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #3 !dbg !54 { +entry: + %call = call noundef ptr @_Z3bazv(), !dbg !55, !callsite !56 + ret ptr %call, !dbg !57 +} + +attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { builtin allocsize(0) } +attributes #3 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git@github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "b.cc", directory: ".", checksumkind: CSK_MD5, checksum: "335f81d275af57725cfc9ffc7be49bc2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, 
!"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!39 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, type: !40, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 2, column: 10, scope: !39) +!43 = !{!44, !46} +!44 = !{!45, !"notcold"} +!45 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!46 = !{!47, !"cold"} +!47 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!48 = !{i64 9086428284934609951} +!49 = !DILocation(line: 2, column: 3, scope: !39) +!50 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!51 = !DILocation(line: 6, column: 10, scope: !50) +!52 = !{i64 -5964873800580613432} +!53 = !DILocation(line: 6, column: 3, scope: !50) +!54 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !40, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41) +!55 = !DILocation(line: 10, column: 10, scope: !54) +!56 = !{i64 2732490490862098848} +!57 = !DILocation(line: 10, column: 3, scope: !54) diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -11,6 +11,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/Support/CommandLine.h" #include 
"llvm/Support/SourceMgr.h" #include "gtest/gtest.h" @@ -34,6 +35,15 @@ return Mod; } + std::unique_ptr makeLLVMIndex(const char *Summary) { + SMDiagnostic Err; + std::unique_ptr Index = + parseSummaryIndexAssemblyString(Summary, Err); + if (!Index) + Err.print("MemoryProfileInfoTest", errs()); + return Index; + } + // This looks for a call that has the given value name, which // is the name of the value being assigned the call return value. CallBase *findCall(Function &F, const char *Name = nullptr) { @@ -359,4 +369,90 @@ } } +TEST_F(MemoryProfileInfoTest, CallStackTestIR) { + LLVMContext C; + std::unique_ptr M = makeLLVMModule(C, + R"IR( +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" +define ptr @test() { +entry: + %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10), !memprof !1, !callsite !6 + ret ptr %call +} +declare noundef nonnull ptr @_Znam(i64 noundef) +!1 = !{!2, !4} +!2 = !{!3, !"notcold"} +!3 = !{i64 1, i64 2, i64 3, i64 4} +!4 = !{!5, !"cold"} +!5 = !{i64 1, i64 2, i64 3, i64 5} +!6 = !{i64 1} +)IR"); + + Function *Func = M->getFunction("test"); + CallBase *Call = findCall(*Func, "call"); + + CallStack InstCallsite( + Call->getMetadata(LLVMContext::MD_callsite)); + + MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); + bool First = true; + for (auto &MIBOp : MemProfMD->operands()) { + auto *MIBMD = cast(MIBOp); + MDNode *StackNode = getMIBStackNode(MIBMD); + CallStack StackContext(StackNode); + std::vector StackIds; + for (auto ContextIter = StackContext.beginAfterSharedPrefix(InstCallsite); + ContextIter != StackContext.end(); ++ContextIter) + StackIds.push_back(*ContextIter); + if (First) + EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({2UL, 3UL, 4UL})); + else + EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({2UL, 3UL, 5UL})); + First = false; + } +} + +TEST_F(MemoryProfileInfoTest, CallStackTestSummary) { + std::unique_ptr Index = 
makeLLVMIndex(R"Summary( +^0 = module: (path: "test.o", hash: (0, 0, 0, 0, 0)) +^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (1, 2, 3, 4)), (type: cold, stackIds: (1, 2, 3, 5)))))))) +^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (3, 4)), (callee: ^1, clones: (0), stackIds: (3, 5)))))) +)Summary"); + + ASSERT_NE(Index, nullptr); + auto *CallsiteSummary = + cast(Index->getGlobalValueSummary(/*guid=*/25)); + bool First = true; + for (auto &CI : CallsiteSummary->callsites()) { + CallStack::const_iterator> InstCallsite( + &CI); + std::vector StackIds; + for (auto StackIdIndex : InstCallsite) + StackIds.push_back(Index->getStackIdAtIndex(StackIdIndex)); + if (First) + EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({3UL, 4UL})); + else + EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({3UL, 5UL})); + First = false; + } + + auto *AllocSummary = + cast(Index->getGlobalValueSummary(/*guid=*/23)); + for (auto &AI : AllocSummary->allocs()) { + bool First = true; + for (auto &MIB : AI.MIBs) { + CallStack::const_iterator> StackContext( + &MIB); + std::vector StackIds; + for (auto StackIdIndex : StackContext) + StackIds.push_back(Index->getStackIdAtIndex(StackIdIndex)); + if (First) + EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({1UL, 2UL, 3UL, 
4UL})); + else + EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({1UL, 2UL, 3UL, 5UL})); + First = false; + } + } +} } // end anonymous namespace