diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -63,6 +63,10 @@ /// This is set to true if the cloned code contains a normal call instruction. bool ContainsCalls = false; + /// This is set to true if there is memprof related metadata (memprof or + /// callsite metadata) in the cloned code. + bool ContainsMemProfMetadata = false; + /// This is set to true if the cloned code contains a 'dynamic' alloca. /// Dynamic allocas are allocas that are either not in the entry block or they /// are in the entry block but are not a constant size. diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -315,12 +315,6 @@ return getAllocationData(V, MallocOrOpNewLike, TLI).has_value(); } -/// Tests if a value is a call or invoke to a library function that -/// allocates memory via new. -bool llvm::isNewLikeFn(const Value *V, const TargetLibraryInfo *TLI) { - return getAllocationData(V, OpNewLike, TLI).hasValue(); -} - /// Tests if a value is a call or invoke to a library function that /// allocates uninitialized memory with alignment (such as aligned_alloc). static bool isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1037,9 +1037,6 @@ // Read memprof data for the instrumented function from profile. bool readMemprof(IndexedInstrProfReader *PGOReader); - // Read memprof data for the instrumented function from profile. - bool readMemprof(IndexedInstrProfReader *PGOReader); - // Populate the counts for all BBs. void populateCounters(); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -46,7 +46,7 @@ if (BB->hasName()) NewBB->setName(BB->getName() + NameSuffix); - bool hasCalls = false, hasDynamicAllocas = false; + bool hasCalls = false, hasDynamicAllocas = false, hasMemProfMetadata = false; Module *TheModule = F ? F->getParent() : nullptr; // Loop over all instructions, and copy them over. @@ -60,7 +60,10 @@ NewBB->getInstList().push_back(NewInst); VMap[&I] = NewInst; // Add instruction map to value. - hasCalls |= (isa(I) && !I.isDebugOrPseudoInst()); + if (isa(I) && !I.isDebugOrPseudoInst()) { + hasCalls = true; + hasMemProfMetadata |= I.hasMetadata(LLVMContext::MD_memprof); + } if (const AllocaInst *AI = dyn_cast(&I)) { if (!AI->isStaticAlloca()) { hasDynamicAllocas = true; @@ -70,6 +73,7 @@ if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; + CodeInfo->ContainsMemProfMetadata |= hasMemProfMetadata; CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; } return NewBB; @@ -471,6 +475,7 @@ } bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; + bool hasMemProfMetadata = false; // Loop over all instructions, and copy them over, DCE'ing as we go. This // loop doesn't include the terminator. @@ -515,7 +520,10 @@ NewInst->setName(II->getName() + NameSuffix); VMap[&*II] = NewInst; // Add instruction map to value. NewBB->getInstList().push_back(NewInst); - hasCalls |= (isa(II) && !II->isDebugOrPseudoInst()); + if (isa(II) && !II->isDebugOrPseudoInst()) { + hasCalls = true; + hasMemProfMetadata |= II->hasMetadata(LLVMContext::MD_memprof); + } if (CodeInfo) { CodeInfo->OrigVMap[&*II] = NewInst; @@ -589,6 +597,7 @@ if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; + CodeInfo->ContainsMemProfMetadata |= hasMemProfMetadata; CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/Analysis/ObjCARCAnalysisUtils.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -74,7 +75,10 @@ #include #include +#define DEBUG_TYPE "inline-function" + using namespace llvm; +using namespace llvm::memprof; using ProfileCount = Function::ProfileCount; static cl::opt @@ -782,6 +786,174 @@ UnwindDest->removePredecessor(InvokeBB); } +static bool haveCommonPrefix(MDNode *MIBStackContext, + MDNode *CallsiteStackContext) { + assert(MIBStackContext->getNumOperands() > 0 && + CallsiteStackContext->getNumOperands() > 0); + // Because of the context trimming performed during matching, the callsite + // context could have more stack ids than the MIB. We match up to the end of + // the shortest stack context. + for (auto MIBStackIter = MIBStackContext->op_begin(), + CallsiteStackIter = CallsiteStackContext->op_begin(); + MIBStackIter != MIBStackContext->op_end() && + CallsiteStackIter != CallsiteStackContext->op_end(); + MIBStackIter++, CallsiteStackIter++) { + auto *Val1 = mdconst::dyn_extract(*MIBStackIter); + auto *Val2 = mdconst::dyn_extract(*CallsiteStackIter); + assert(Val1 && Val2); + if (Val1->getZExtValue() != Val2->getZExtValue()) + return false; + } + return true; +} + +static void removeMemProfMetadata(CallBase *Call) { + Call->setMetadata(LLVMContext::MD_memprof, nullptr); +} + +static void removeCallsiteMetadata(CallBase *Call) { + Call->setMetadata(LLVMContext::MD_callsite, nullptr); +} + +static void updateMemprofMetadata(CallBase *CI, + const std::vector &MIBList) { + assert(!MIBList.empty()); + // Remove existing memprof, which will either be replaced or may not be needed + // if we are able to use a single allocation type function attribute. + removeMemProfMetadata(CI); + CallStackTrie CallStack; + for (Metadata *MIB : MIBList) + CallStack.addCallStack(cast(MIB)); + bool MemprofMDAttached = CallStack.buildAndAttachMIBMetadata(CI); + assert(MemprofMDAttached == CI->hasMetadata(LLVMContext::MD_memprof)); + if (!MemprofMDAttached) + // If we used a function attribute remove the callsite metadata as well. + removeCallsiteMetadata(CI); +} + +// Update the metadata on the inlined copy ClonedCall of a call OrigCall in the +// inlined callee body, based on the callsite metadata InlinedCallsiteMD from +// the call that was inlined. +static void +propagateMemProfHelper(const CallBase *OrigCall, CallBase *ClonedCall, + MDNode *InlinedCallsiteMD, + std::map> + &OrigCallToNewMemProfMDMap) { + MDNode *OrigCallsiteMD = ClonedCall->getMetadata(LLVMContext::MD_callsite); + MDNode *ClonedCallsiteMD = nullptr; + // Check if the call originally had callsite metadata, and update it for the + // new call in the inlined body. + if (OrigCallsiteMD) { + // The cloned call's context is now the concatenation of the original call's + // callsite metadata and the callsite metadata on the call where it was + // inlined. + ClonedCallsiteMD = MDNode::concatenate(OrigCallsiteMD, InlinedCallsiteMD); + ClonedCall->setMetadata(LLVMContext::MD_callsite, ClonedCallsiteMD); + } + + // Update any memprof metadata on the cloned call. + MDNode *OrigMemProfMD = ClonedCall->getMetadata(LLVMContext::MD_memprof); + if (!OrigMemProfMD) + return; + // We currently expect that allocations with memprof metadata also have + // callsite metadata for the allocation's part of the context. + assert(OrigCallsiteMD); + + // New call's MIB list. + std::vector NewMIBList; + // Updated MIB list for the original call in the out-of-line callee. + std::vector UpdatedOrigMIBList; + + // For each MIB metadata, check if its call stack context starts with the + // new clone's callsite metadata. If so, that MIB goes onto the cloned call in + // the inlined body. If not, it stays on the out-of-line original call. + for (auto &MIBOp : OrigMemProfMD->operands()) { + MDNode *MIB = dyn_cast(MIBOp); + // Stack is first operand of MIB. + MDNode *StackMD = getMIBStackNode(MIB); + assert(StackMD); + // See if the new cloned callsite context matches this profiled context. + if (haveCommonPrefix(StackMD, ClonedCallsiteMD)) + // Add it to the cloned call's MIB list. + NewMIBList.push_back(MIB); + else + // Keep it on the original call. + UpdatedOrigMIBList.push_back(MIB); + } + if (NewMIBList.empty()) { + removeMemProfMetadata(ClonedCall); + removeCallsiteMetadata(ClonedCall); + return; + } + if (NewMIBList.size() < OrigMemProfMD->getNumOperands()) { + assert(!UpdatedOrigMIBList.empty()); + OrigCallToNewMemProfMDMap[OrigCall] = UpdatedOrigMIBList; + updateMemprofMetadata(ClonedCall, NewMIBList); + } else + OrigCallToNewMemProfMDMap[OrigCall] = {}; +} + +// Update memprof related metadata (!memprof and !callsite) based on the +// inlining of Callee into the callsite at CB. The updates include merging the +// inlined callee's callsite metadata with that of the inlined call, +// and moving the subset of any memprof contexts to the inlined callee +// allocations if they match the new inlined call stack. +// FIXME: Replace memprof metadata with function attribute if all MIB end up +// having the same behavior. Do other context trimming/merging optimizations +// too. +static void +propagateMemProfMetadata(Function *Callee, CallBase &CB, + bool ContainsMemProfMetadata, + const ValueMap &VMap) { + MDNode *CallsiteMD = CB.getMetadata(LLVMContext::MD_callsite); + // Only need to update if the inlined callsite had callsite metadata, or if + // there was any memprof metadata inlined. + if (!CallsiteMD && !ContainsMemProfMetadata) + return; + + // Propagate metadata onto the cloned calls in the inlined callee. + // Can't update the original call using the VMap since it holds a const + // pointer, those will be updated in the subsequent loop. + std::map> OrigCallToNewMemProfMDMap; + for (const auto &Entry : VMap) { + // See if this is a call that has been inlined and remapped, and not + // simplified away in the process. + auto *OrigCall = dyn_cast_or_null(Entry.first); + auto *ClonedCall = dyn_cast_or_null(Entry.second); + if (!OrigCall || !ClonedCall) + continue; + // If the inlined callsite did not have any callsite metadata, then it isn't + // involved in any profiled call contexts, and we can remove any memprof + // metadata on the cloned call. + if (!CallsiteMD) { + removeMemProfMetadata(ClonedCall); + removeCallsiteMetadata(ClonedCall); + continue; + } + propagateMemProfHelper(OrigCall, ClonedCall, CallsiteMD, + OrigCallToNewMemProfMDMap); + } + + // Update memprof MD on calls within the original callee function to remove + // MIB with stacks that matched the inlined context (those moved to a new + // memprof MD on the inlined version of the call). + for (BasicBlock &BB : *Callee) { + for (Instruction &I : BB) { + CallBase *Call = dyn_cast(&I); + if (!Call || !OrigCallToNewMemProfMDMap.count(Call)) + continue; + std::vector &UpdatedMemProfMD = + OrigCallToNewMemProfMDMap[Call]; + if (!UpdatedMemProfMD.empty()) + updateMemprofMetadata(Call, UpdatedMemProfMD); + else { + removeMemProfMetadata(Call); + removeCallsiteMetadata(Call); + } + } + } +} + /// When inlining a call site that has !llvm.mem.parallel_loop_access, /// !llvm.access.group, !alias.scope or !noalias metadata, that metadata should /// be propagated to all memory-accessing cloned instructions. @@ -2091,6 +2263,9 @@ // function which feed into its return value. AddReturnAttributes(CB, VMap); + propagateMemProfMetadata(CalledFunc, CB, + InlinedFunctionInfo.ContainsMemProfMetadata, VMap); + // Propagate metadata on the callsite if necessary. PropagateCallSiteMetadata(CB, FirstNewBlock, Caller->end()); diff --git a/llvm/test/Transforms/Inline/memprof_inline.ll b/llvm/test/Transforms/Inline/memprof_inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/memprof_inline.ll @@ -0,0 +1,228 @@ +;; Test for memprof metadata propagation, ensuring metadata is simplified +;; to function attributes appropriately after inlining profiled call chains. +;; +;; The following code was used to generate the following IR and its memprof +;; profile: +;; +;; #include +;; #include +;; #include +;; char *foo() { +;; return new char[10]; +;; } +;; char *foo2() { +;; return foo(); +;; } +;; int main(int argc, char **argv) { +;; char *c = foo(); +;; char *d = foo(); +;; char *e = foo2(); +;; memset(c, 0, 10); +;; memset(d, 0, 10); +;; memset(e, 0, 10); +;; delete[] c; +;; sleep(200); +;; delete[] d; +;; delete[] e; +;; return 0; +;; } + + +; RUN: opt -inline %s -S | FileCheck %s + +; ModuleID = 'memprof_inline.cc' +source_filename = "memprof_inline.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress uwtable +; CHECK-LABEL: define dso_local noundef i8* @_Z3foov +define dso_local noundef i8* @_Z3foov() #0 !dbg !39 { +entry: + ; CHECK: call {{.*}} @_Znam + ; CHECK-NOT: !memprof + ; CHECK-NOT: !callsite + %call = call noalias noundef nonnull i8* @_Znam(i64 noundef 10) #6, !dbg !42, !memprof !43, !callsite !50 + ; CHECK-NEXT: ret + ret i8* %call, !dbg !51 +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull i8* @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress uwtable +; CHECK-LABEL: define dso_local noundef i8* @_Z4foo2v +define dso_local noundef i8* @_Z4foo2v() #0 !dbg !52 { +entry: + ; CHECK: call {{.*}} @_Znam{{.*}} #[[COLD:[0-9]+]] + %call = call noundef i8* @_Z3foov(), !dbg !53, !callsite !54 + ret i8* %call, !dbg !55 +} + +; Function Attrs: mustprogress norecurse uwtable +; CHECK-LABEL: define dso_local noundef i32 @main +define dso_local noundef i32 @main(i32 noundef %argc, i8** noundef %argv) #2 !dbg !56 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %c = alloca i8*, align 8 + %d = alloca i8*, align 8 + %e = alloca i8*, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + ; CHECK: call {{.*}} @_Znam{{.*}} #[[NOTCOLD:[0-9]+]] + %call = call noundef i8* @_Z3foov(), !dbg !57, !callsite !58 + store i8* %call, i8** %c, align 8, !dbg !59 + ; CHECK: call {{.*}} @_Znam{{.*}} #[[COLD]] + %call1 = call noundef i8* @_Z3foov(), !dbg !60, !callsite !61 + store i8* %call1, i8** %d, align 8, !dbg !62 + ; CHECK: call {{.*}} @_Znam{{.*}} #[[COLD]] + %call2 = call noundef i8* @_Z4foo2v(), !dbg !63, !callsite !64 + store i8* %call2, i8** %e, align 8, !dbg !65 + %0 = load i8*, i8** %c, align 8, !dbg !66 + call void @llvm.memset.p0i8.i64(i8* align 1 %0, i8 0, i64 10, i1 false), !dbg !67 + %1 = load i8*, i8** %d, align 8, !dbg !68 + call void @llvm.memset.p0i8.i64(i8* align 1 %1, i8 0, i64 10, i1 false), !dbg !69 + %2 = load i8*, i8** %e, align 8, !dbg !70 + call void @llvm.memset.p0i8.i64(i8* align 1 %2, i8 0, i64 10, i1 false), !dbg !71 + %3 = load i8*, i8** %c, align 8, !dbg !72 + %isnull = icmp eq i8* %3, null, !dbg !73 + br i1 %isnull, label %delete.end, label %delete.notnull, !dbg !73 + +delete.notnull: ; preds = %entry + call void @_ZdaPv(i8* noundef %3) #7, !dbg !74 + br label %delete.end, !dbg !74 + +delete.end: ; preds = %delete.notnull, %entry + %call4 = call i32 @sleep(i32 noundef 200), !dbg !76 + %4 = load i8*, i8** %d, align 8, !dbg !77 + %isnull5 = icmp eq i8* %4, null, !dbg !78 + br i1 %isnull5, label %delete.end7, label %delete.notnull6, !dbg !78 + +delete.notnull6: ; preds = %delete.end + call void @_ZdaPv(i8* noundef %4) #7, !dbg !79 + br label %delete.end7, !dbg !79 + +delete.end7: ; preds = %delete.notnull6, %delete.end + %5 = load i8*, i8** %e, align 8, !dbg !80 + %isnull8 = icmp eq i8* %5, null, !dbg !81 + br i1 %isnull8, label %delete.end10, label %delete.notnull9, !dbg !81 + +delete.notnull9: ; preds = %delete.end7 + call void @_ZdaPv(i8* noundef %5) #7, !dbg !82 + br label %delete.end10, !dbg !82 + +delete.end10: ; preds = %delete.notnull9, %delete.end7 + ret i32 0, !dbg !83 +} + +; Function Attrs: argmemonly nofree nounwind willreturn writeonly +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(i8* noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +; CHECK: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } +; CHECK: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } + +attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { argmemonly nofree nounwind willreturn writeonly } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8, !9} +!llvm.ident = !{!38} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof_inline.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "8711f6fd269e6cb5611fef48bc906eab") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = !{i32 1, !"ProfileSummary", !10} +!10 = !{!11, !12, !13, !14, !15, !16, !17, !18, !19, !20} +!11 = !{!"ProfileFormat", !"InstrProf"} +!12 = !{!"TotalCount", i64 0} +!13 = !{!"MaxCount", i64 0} +!14 = !{!"MaxInternalCount", i64 0} +!15 = !{!"MaxFunctionCount", i64 0} +!16 = !{!"NumCounts", i64 0} +!17 = !{!"NumFunctions", i64 0} +!18 = !{!"IsPartialProfile", i64 0} +!19 = !{!"PartialProfileRatio", double 0.000000e+00} +!20 = !{!"DetailedSummary", !21} +!21 = !{!22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37} +!22 = !{i32 10000, i64 0, i32 0} +!23 = !{i32 100000, i64 0, i32 0} +!24 = !{i32 200000, i64 0, i32 0} +!25 = !{i32 300000, i64 0, i32 0} +!26 = !{i32 400000, i64 0, i32 0} +!27 = !{i32 500000, i64 0, i32 0} +!28 = !{i32 600000, i64 0, i32 0} +!29 = !{i32 700000, i64 0, i32 0} +!30 = !{i32 800000, i64 0, i32 0} +!31 = !{i32 900000, i64 0, i32 0} +!32 = !{i32 950000, i64 0, i32 0} +!33 = !{i32 990000, i64 0, i32 0} +!34 = !{i32 999000, i64 0, i32 0} +!35 = !{i32 999900, i64 0, i32 0} +!36 = !{i32 999990, i64 0, i32 0} +!37 = !{i32 999999, i64 0, i32 0} +!38 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)"} +!39 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 5, column: 10, scope: !39) +!43 = !{!44, !46, !48} +!44 = !{!45, !"cold"} +!45 = !{i64 -2458008693472584243, i64 7394638144382192936} +!46 = !{!47, !"noncold"} +!47 = !{i64 -2458008693472584243, i64 -8908997186479157179} +!48 = !{!49, !"cold"} +!49 = !{i64 -2458008693472584243, i64 -8079659623765193173} +!50 = !{i64 -2458008693472584243} +!51 = !DILocation(line: 5, column: 3, scope: !39) +!52 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2v", scope: !1, file: !1, line: 7, type: !40, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!53 = !DILocation(line: 8, column: 10, scope: !52) +!54 = !{i64 -8079659623765193173} +!55 = !DILocation(line: 8, column: 3, scope: !52) +!56 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 16, type: !40, scopeLine: 16, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!57 = !DILocation(line: 17, column: 13, scope: !56) +!58 = !{i64 -8908997186479157179} +!59 = !DILocation(line: 17, column: 9, scope: !56) +!60 = !DILocation(line: 18, column: 13, scope: !56) +!61 = !{i64 7394638144382192936} +!62 = !DILocation(line: 18, column: 9, scope: !56) +!63 = !DILocation(line: 19, column: 13, scope: !56) +!64 = !{i64 -5510257407004945023} +!65 = !DILocation(line: 19, column: 9, scope: !56) +!66 = !DILocation(line: 21, column: 10, scope: !56) +!67 = !DILocation(line: 21, column: 3, scope: !56) +!68 = !DILocation(line: 22, column: 10, scope: !56) +!69 = !DILocation(line: 22, column: 3, scope: !56) +!70 = !DILocation(line: 23, column: 10, scope: !56) +!71 = !DILocation(line: 23, column: 3, scope: !56) +!72 = !DILocation(line: 25, column: 12, scope: !56) +!73 = !DILocation(line: 25, column: 3, scope: !56) +!74 = !DILocation(line: 25, column: 3, scope: !75) +!75 = !DILexicalBlockFile(scope: !56, file: !1, discriminator: 2) +!76 = !DILocation(line: 26, column: 3, scope: !56) +!77 = !DILocation(line: 27, column: 12, scope: !56) +!78 = !DILocation(line: 27, column: 3, scope: !56) +!79 = !DILocation(line: 27, column: 3, scope: !75) +!80 = !DILocation(line: 28, column: 12, scope: !56) +!81 = !DILocation(line: 28, column: 3, scope: !56) +!82 = !DILocation(line: 28, column: 3, scope: !75) +!83 = !DILocation(line: 30, column: 3, scope: !56) diff --git a/llvm/test/Transforms/Inline/memprof_inline2.ll b/llvm/test/Transforms/Inline/memprof_inline2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/memprof_inline2.ll @@ -0,0 +1,316 @@ +;; Test for memprof metadata propagation, ensuring metadata is moved to +;; inlined callsites. +;; Also check that callsite metadata was updated with inlined stack ids. +;; +;; The following code was used to generate the following IR and its memprof +;; profile: +;; +;; #include +;; #include +;; #include +;; char *foo() { +;; return new char[10]; +;; } +;; char *foo2() __attribute((noinline)) { +;; return foo(); +;; } +;; char *bar() { +;; return foo2(); +;; } +;; char *baz() { +;; return foo2(); +;; } +;; int main(int argc, char **argv) { +;; char *c = foo(); +;; char *d = foo(); +;; char *e = bar(); +;; char *f = baz(); +;; memset(c, 0, 10); +;; memset(d, 0, 10); +;; memset(e, 0, 10); +;; memset(f, 0, 10); +;; delete[] c; +;; sleep(200); +;; delete[] d; +;; delete[] e; +;; delete[] f; +;; return 0; +;; } + +; RUN: opt -inline %s -S | FileCheck %s + +; ModuleID = 'memprof_inline2.cc' +source_filename = "memprof_inline2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress uwtable +; CHECK-LABEL: define dso_local noundef i8* @_Z3foov +define dso_local noundef i8* @_Z3foov() #0 !dbg !39 { +entry: + ;; We should still have memprof/callsite metadata for the non-inlined calls + ;; from main, but should have removed those from the inlined call in_Z4foo2v. + ;; CHECK: call {{.*}} @_Znam{{.*}} !memprof ![[ORIGMEMPROF:[0-9]+]] + %call = call noalias noundef nonnull i8* @_Znam(i64 noundef 10) #7, !dbg !42, !memprof !43, !callsite !52 + ret i8* %call, !dbg !53 +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull i8* @_Znam(i64 noundef) #1 + +;; Mark noinline so we don't inline into calls from bar and baz. We should end +;; up with a memprof metadata on the call to foo below. +; Function Attrs: mustprogress noinline uwtable +; CHECK-LABEL: define dso_local noundef i8* @_Z4foo2v +define dso_local noundef i8* @_Z4foo2v() #2 !dbg !54 { +entry: + ;; We should have memprof metadata for the call stacks from bar and baz, + ;; and the callsite metadata should be the concatentation of the id from the + ;; inlined call to new and the original callsite. + ; CHECK: call {{.*}} @_Znam{{.*}} !memprof ![[NEWMEMPROF:[0-9]+]], !callsite ![[NEWCALLSITE:[0-9]+]] + %call = call noundef i8* @_Z3foov(), !dbg !55, !callsite !56 + ret i8* %call, !dbg !57 +} + +; Function Attrs: mustprogress uwtable +define dso_local noundef i8* @_Z3barv() #0 !dbg !58 { +entry: + %call = call noundef i8* @_Z4foo2v(), !dbg !59, !callsite !60 + ret i8* %call, !dbg !61 +} + +; Function Attrs: mustprogress uwtable +define dso_local noundef i8* @_Z3bazv() #0 !dbg !62 { +entry: + %call = call noundef i8* @_Z4foo2v(), !dbg !63, !callsite !64 + ret i8* %call, !dbg !65 +} + +;; Make sure we don't propagate any memprof/callsite metadata +; Function Attrs: mustprogress uwtable +; CHECK-LABEL: define dso_local noundef i8* @notprofiled +define dso_local noundef i8* @notprofiled() #0 !dbg !66 { +entry: + ; CHECK: call {{.*}} @_Znam + ; CHECK-NOT: !memprof + ; CHECK-NOT: !callsite + %call = call noundef i8* @_Z3foov(), !dbg !67 + ; CHECK-NEXT: ret + ret i8* %call, !dbg !68 +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, i8** noundef %argv) #3 !dbg !69 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %c = alloca i8*, align 8 + %d = alloca i8*, align 8 + %e = alloca i8*, align 8 + %f = alloca i8*, align 8 + store i32 0, i32* %retval, align 4 + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + ;; The below 4 callsites are all annotated as noinline + %call = call noundef i8* @_Z3foov() #8, !dbg !70, !callsite !71 + store i8* %call, i8** %c, align 8, !dbg !72 + %call1 = call noundef i8* @_Z3foov() #8, !dbg !73, !callsite !74 + store i8* %call1, i8** %d, align 8, !dbg !75 + %call2 = call noundef i8* @_Z3barv() #8, !dbg !76, !callsite !77 + store i8* %call2, i8** %e, align 8, !dbg !78 + %call3 = call noundef i8* @_Z3bazv() #8, !dbg !79, !callsite !80 + store i8* %call3, i8** %f, align 8, !dbg !81 + %0 = load i8*, i8** %c, align 8, !dbg !82 + call void @llvm.memset.p0i8.i64(i8* align 1 %0, i8 0, i64 10, i1 false), !dbg !83 + %1 = load i8*, i8** %d, align 8, !dbg !84 + call void @llvm.memset.p0i8.i64(i8* align 1 %1, i8 0, i64 10, i1 false), !dbg !85 + %2 = load i8*, i8** %e, align 8, !dbg !86 + call void @llvm.memset.p0i8.i64(i8* align 1 %2, i8 0, i64 10, i1 false), !dbg !87 + %3 = load i8*, i8** %f, align 8, !dbg !88 + call void @llvm.memset.p0i8.i64(i8* align 1 %3, i8 0, i64 10, i1 false), !dbg !89 + %4 = load i8*, i8** %c, align 8, !dbg !90 + %isnull = icmp eq i8* %4, null, !dbg !91 + br i1 %isnull, label %delete.end, label %delete.notnull, !dbg !91 + +delete.notnull: ; preds = %entry + call void @_ZdaPv(i8* noundef %4) #9, !dbg !92 + br label %delete.end, !dbg !92 + +delete.end: ; preds = %delete.notnull, %entry + %call4 = call i32 @sleep(i32 noundef 200), !dbg !94 + %5 = load i8*, i8** %d, align 8, !dbg !95 + %isnull5 = icmp eq i8* %5, null, !dbg !96 + br i1 %isnull5, label %delete.end7, label %delete.notnull6, !dbg !96 + +delete.notnull6: ; preds = %delete.end + call void @_ZdaPv(i8* noundef %5) #9, !dbg !97 + br label %delete.end7, !dbg !97 + +delete.end7: ; preds = %delete.notnull6, %delete.end + %6 = load i8*, i8** %e, align 8, !dbg !98 + %isnull8 = icmp eq i8* %6, null, !dbg !99 + br i1 %isnull8, label %delete.end10, label %delete.notnull9, !dbg !99 + +delete.notnull9: ; preds = %delete.end7 + call void @_ZdaPv(i8* noundef %6) #9, !dbg !100 + br label %delete.end10, !dbg !100 + +delete.end10: ; preds = %delete.notnull9, %delete.end7 + %7 = load i8*, i8** %f, align 8, !dbg !101 + %isnull11 = icmp eq i8* %7, null, !dbg !102 + br i1 %isnull11, label %delete.end13, label %delete.notnull12, !dbg !102 + +delete.notnull12: ; preds = %delete.end10 + call void @_ZdaPv(i8* noundef %7) #9, !dbg !103 + br label %delete.end13, !dbg !103 + +delete.end13: ; preds = %delete.notnull12, %delete.end10 + ret i32 0, !dbg !104 +} + +; Function Attrs: argmemonly nofree nounwind willreturn writeonly +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(i8* noundef) #5 + +declare i32 @sleep(i32 noundef) #6 + +attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { argmemonly nofree nounwind willreturn writeonly } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { noinline } +attributes #9 = { builtin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8, !9} +!llvm.ident = !{!38} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof_inline.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "8711f6fd269e6cb5611fef48bc906eab") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = !{i32 1, !"ProfileSummary", !10} +!10 = !{!11, !12, !13, !14, !15, !16, !17, !18, !19, !20} +!11 = !{!"ProfileFormat", !"InstrProf"} +!12 = !{!"TotalCount", i64 0} +!13 = !{!"MaxCount", i64 0} +!14 = !{!"MaxInternalCount", i64 0} +!15 = !{!"MaxFunctionCount", i64 0} +!16 = !{!"NumCounts", i64 0} +!17 = !{!"NumFunctions", i64 0} +!18 = !{!"IsPartialProfile", i64 0} +!19 = !{!"PartialProfileRatio", double 0.000000e+00} +!20 = !{!"DetailedSummary", !21} +!21 = !{!22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37} +!22 = !{i32 10000, i64 0, i32 0} +!23 = !{i32 100000, i64 0, i32 0} +!24 = !{i32 200000, i64 0, i32 0} +!25 = !{i32 300000, i64 0, i32 0} +!26 = !{i32 400000, i64 0, i32 0} +!27 = !{i32 500000, i64 0, i32 0} +!28 = !{i32 600000, i64 0, i32 0} +!29 = !{i32 700000, i64 0, i32 0} +!30 = !{i32 800000, i64 0, i32 0} +!31 = !{i32 900000, i64 0, i32 0} +!32 = !{i32 950000, i64 0, i32 0} +!33 = !{i32 990000, i64 0, i32 0} +!34 = !{i32 999000, i64 0, i32 0} +!35 = !{i32 999900, i64 0, i32 0} +!36 = !{i32 999990, i64 0, i32 0} +!37 = !{i32 999999, i64 0, i32 0} +!38 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)"} +!39 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 5, column: 10, scope: !39) +;; The first 2 are from the direct calls to foo from main. Those stay on the +;; callsite in foo, which isn't inlined into main due to the callsites in main +;; being annotated as noinline. +;; The second 2 are from the calls from foo2, which inlines its callsite to foo +;; but is not itself inlined into its callers. Therefore they get moved to a +;; new memprof metadata within foo2. +!43 = !{!44, !46, !48, !50} +!44 = !{!45, !"cold"} +!45 = !{i64 -2458008693472584243, i64 7394638144382192936} +!46 = !{!47, !"noncold"} +!47 = !{i64 -2458008693472584243, i64 -8908997186479157179} +!48 = !{!49, !"noncold"} +!49 = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -4805294506621015872} +!50 = !{!51, !"cold"} +!51 = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -972865200055133905} +; CHECK: ![[ORIGMEMPROF]] = !{![[ORIGMIB1:[0-9]+]], ![[ORIGMIB2:[0-9]+]]} +; CHECK: ![[ORIGMIB1]] = !{![[ORIGMIBSTACK1:[0-9]+]], !"cold"} +; CHECK: ![[ORIGMIBSTACK1]] = !{i64 -2458008693472584243, i64 7394638144382192936} +; CHECK: ![[ORIGMIB2]] = !{![[ORIGMIBSTACK2:[0-9]+]], !"notcold"} +; CHECK: ![[ORIGMIBSTACK2]] = !{i64 -2458008693472584243, i64 -8908997186479157179} +; CHECK: ![[NEWMEMPROF]] = !{![[NEWMIB1:[0-9]+]], ![[NEWMIB2:[0-9]+]]} +; CHECK: ![[NEWMIB1]] = !{![[NEWMIBSTACK1:[0-9]+]], !"notcold"} +; CHECK: ![[NEWMIBSTACK1]] = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -4805294506621015872} +; CHECK: ![[NEWMIB2]] = !{![[NEWMIBSTACK2:[0-9]+]], !"cold"} +; CHECK: ![[NEWMIBSTACK2]] = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -972865200055133905} +; CHECK: ![[NEWCALLSITE]] = !{i64 -2458008693472584243, i64 -8079659623765193173} +!52 = !{i64 -2458008693472584243} +!53 = !DILocation(line: 5, column: 3, scope: !39) +!54 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2v", scope: !1, file: !1, line: 7, type: !40, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!55 = !DILocation(line: 8, column: 10, scope: !54) +!56 = !{i64 -8079659623765193173} +!57 = !DILocation(line: 8, column: 3, scope: !54) +!58 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 10, type: !40, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!59 = !DILocation(line: 11, column: 10, scope: !58) +!60 = !{i64 -972865200055133905} +!61 = !DILocation(line: 11, column: 3, scope: !58) +!62 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 13, type: !40, scopeLine: 13, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!63 = !DILocation(line: 14, column: 10, scope: !62) +!64 = !{i64 -4805294506621015872} +!65 = !DILocation(line: 14, column: 3, scope: !62) +!66 = distinct !DISubprogram(name: "notprofiled", linkageName: "notprofiled", scope: !1, file: !1, line: 400, type: !40, scopeLine: 400, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!67 = !DILocation(line: 401, column: 10, scope: !66) +!68 = !DILocation(line: 401, column: 3, scope: !66) +!69 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 16, type: !40, scopeLine: 16, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!70 = !DILocation(line: 17, column: 13, scope: !69) +!71 = !{i64 -8908997186479157179} +!72 = !DILocation(line: 17, column: 9, scope: !69) +!73 = !DILocation(line: 18, column: 13, scope: !69) +!74 = !{i64 7394638144382192936} +!75 = !DILocation(line: 18, column: 9, scope: !69) +!76 = !DILocation(line: 19, column: 13, scope: !69) +!77 = !{i64 -5510257407004945023} +!78 = !DILocation(line: 19, column: 9, scope: !69) +!79 = !DILocation(line: 20, column: 13, scope: !69) +!80 = !{i64 8771588133652501463} +!81 = !DILocation(line: 20, column: 9, scope: !69) +!82 = !DILocation(line: 21, column: 10, scope: !69) +!83 = !DILocation(line: 21, column: 3, scope: !69) +!84 = !DILocation(line: 22, column: 10, scope: !69) +!85 = !DILocation(line: 22, column: 3, scope: !69) +!86 = !DILocation(line: 23, column: 10, scope: !69) +!87 = !DILocation(line: 23, column: 3, scope: !69) +!88 = !DILocation(line: 24, column: 10, scope: !69) +!89 = !DILocation(line: 24, column: 3, scope: !69) +!90 = !DILocation(line: 25, column: 12, scope: !69) +!91 = !DILocation(line: 25, column: 3, scope: !69) +!92 = !DILocation(line: 25, column: 3, scope: !93) +!93 = !DILexicalBlockFile(scope: !69, file: !1, discriminator: 2) +!94 = !DILocation(line: 26, column: 3, scope: !69) +!95 = !DILocation(line: 27, column: 12, scope: !69) +!96 = !DILocation(line: 27, column: 3, scope: !69) +!97 = !DILocation(line: 27, column: 3, scope: !93) +!98 = !DILocation(line: 28, column: 12, scope: !69) +!99 = !DILocation(line: 28, column: 3, scope: !69) +!100 = !DILocation(line: 28, column: 3, scope: !93) +!101 = !DILocation(line: 29, column: 12, scope: !69) +!102 = !DILocation(line: 29, column: 3, scope: !69) +!103 = !DILocation(line: 29, column: 3, scope: !93) +!104 = !DILocation(line: 30, column: 3, scope: !69)