diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp --- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -45,7 +45,7 @@ // Set the maximum number of targets to promote for a single indirect-call // callsite. -cl::opt +static cl::opt MaxNumPromotions("icp-max-prom", cl::init(3), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of promotions for a single indirect " "call callsite")); diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -218,7 +218,11 @@ "by inlining from sample profile loader."), cl::Hidden); -extern cl::opt MaxNumPromotions; +static cl::opt + MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, + cl::ZeroOrMore, + cl::desc("Max number of promotions for a single indirect " + "call callsite in sample profile loader")); namespace { @@ -363,8 +367,7 @@ // Attempt to promote indirect call and also inline the promoted call bool tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, - uint64_t &Sum, DenseSet &PromotedInsns, - SmallVector *InlinedCallSites = nullptr); + uint64_t &Sum, SmallVector *InlinedCallSites = nullptr); bool inlineHotFunctions(Function &F, DenseSet &InlinedGUIDs); InlineCost shouldInlineCandidate(InlineCandidate &Candidate); @@ -690,9 +693,13 @@ return it.first->second; } +/// Check whether the indirect call promotion history of \p Inst allows +/// the promotion for \p Candidate. /// If the profile count for the promotion candidate \p Candidate is 0, /// it means \p Candidate has already been promoted for \p Inst. 
-static bool isPromotedBefore(const Instruction &Inst, StringRef Candidate) { +/// If we already have at least MaxNumPromotions zero count values in the +/// value profile of \p Inst, we cannot promote for it anymore. +static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) { uint32_t NumVals = 0; uint64_t TotalCount = 0; std::unique_ptr ValueData = @@ -700,15 +707,61 @@ bool Valid = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions, ValueData.get(), NumVals, TotalCount, true); + unsigned NumPromoted = 0; if (Valid) { for (uint32_t I = 0; I < NumVals; I++) { - // If the promotion candidate has 0 count in the metadata, it - // means the candidate has been promoted for this indirect call. - if (ValueData[I].Value == Function::getGUID(Candidate)) - return ValueData[I].Count == 0; + if (ValueData[I].Count == 0) { + // If the promotion candidate has 0 count in the metadata, it + // means the candidate has been promoted for this indirect call. + if (ValueData[I].Value == Function::getGUID(Candidate)) + return false; + NumPromoted++; + } } } - return false; + // If already have MaxNumPromotions promotion, don't do it anymore. + return NumPromoted < MaxNumPromotions; +} + +// Select MaxNumPromotions elements from CallTargets. Select zero count +// values first, then largest count value. Drop the rest if the total +// number of elements is larger than MaxNumPromotions. +static void selectCallTargets(SmallVectorImpl &CallTargets, + unsigned MaxNumPromotions, uint64_t &Total) { + if (CallTargets.size() <= MaxNumPromotions) + return; + + unsigned Num = 0; + auto BeginIt = CallTargets.begin(); + auto EndIt = CallTargets.end(); + // Iteration starts from the last element in CallTargets. + // First select 0 count value in the end. 
+ auto It = BeginIt + (CallTargets.size() - 1); + while (It != CallTargets.begin() && It->Count == 0 && + Num < MaxNumPromotions) { + EndIt = It; + It--; + Num++; + } + + // Then select largest count value at the beginning. + while (BeginIt != EndIt && Num < MaxNumPromotions) { + BeginIt++; + Num++; + } + + // We should have selected MaxNumPromotions elements. Drop the rest. + // Update Total count if Total > 0 (accumulate in uint64_t, not int). + if (Total > 0) { + uint64_t Substract = std::accumulate( + BeginIt, EndIt, uint64_t(0), + [](uint64_t A, const InstrProfValueData &B) { return A + B.Count; }); + assert(Total >= Substract && "Expect Total to be larger than Substract"); + Total -= Substract; + } + CallTargets.erase(BeginIt, EndIt); + assert(CallTargets.size() == MaxNumPromotions && + "Expect CallTargets to have MaxNumPromotions elements"); } /// Update indirect call target profile metadata for \p Inst. If \p Total @@ -728,8 +781,14 @@ getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions, ValueData.get(), NumVals, TotalCount, true); if (Valid) { - for (uint32_t I = 0; I < NumVals; I++) + for (uint32_t I = 0; I < NumVals; I++) { + // If it is try to reset the VP metadata with the new annotated profile + // (Total != 0 indicates that), skip all the non-zero value because they + // are all set with the old profile. + if (Total != 0 && ValueData[I].Count != 0) + continue; ValueCountMap[ValueData[I].Value] = ValueData[I].Count; + } } for (const auto &Data : CallTargets) { @@ -757,6 +816,7 @@ return L.Count > R.Count; return L.Value > R.Value; }); + selectCallTargets(NewCallTargets, MaxNumPromotions, Total); annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst, NewCallTargets, Total ? Total : TotalCount, IPVK_IndirectCallTarget, NewCallTargets.size()); @@ -767,12 +827,10 @@ /// \param F Caller function. /// \param Candidate ICP and inline candidate. /// \param Sum Sum of target counts for indirect call. 
-/// \param PromotedInsns Map to keep track of indirect call already processed. /// \param InlinedCallSite Output vector for new call sites exposed after /// inlining. bool SampleProfileLoader::tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, - DenseSet &PromotedInsns, SmallVector *InlinedCallSite) { auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); @@ -780,7 +838,7 @@ return false; auto &CI = *Candidate.CallInstr; - if (isPromotedBefore(CI, R->getValue()->getName())) + if (!doesHistoryAllowICP(CI, R->getValue()->getName())) return false; const char *Reason = "Callee function not available"; @@ -811,7 +869,6 @@ // be prorated so that the it will reflect the real callsite counts. setProbeDistributionFactor(CI, Candidate.CallsiteDistribution * Sum / SumOrigin); - PromotedInsns.insert(Candidate.CallInstr); Candidate.CallInstr = DI; if (isa(DI) || isa(DI)) { bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite); @@ -884,8 +941,6 @@ /// \returns True if there is any inline happened. bool SampleProfileLoader::inlineHotFunctions( Function &F, DenseSet &InlinedGUIDs) { - DenseSet PromotedInsns; - // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. 
assert((!ProfAccForSymsInList || @@ -939,8 +994,6 @@ if (CalledFunction == &F) continue; if (I->isIndirectCall()) { - if (PromotedInsns.count(I)) - continue; uint64_t Sum; for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { uint64_t SumOrigin = Sum; @@ -953,8 +1006,7 @@ continue; Candidate = {I, FS, FS->getEntrySamples(), 1.0}; - if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, - PromotedInsns)) { + if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) { LocalNotInlinedCallSites.erase(I); LocalChanged = true; } @@ -1163,7 +1215,6 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet &InlinedGUIDs) { - DenseSet PromotedInsns; assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure @@ -1212,8 +1263,6 @@ if (CalledFunction == &F) continue; if (I->isIndirectCall()) { - if (PromotedInsns.count(I)) - continue; uint64_t Sum; auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum); uint64_t SumOrigin = Sum; @@ -1248,7 +1297,7 @@ Candidate = {I, FS, EntryCountDistributed, Candidate.CallsiteDistribution}; if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, - PromotedInsns, &InlinedCallSites)) { + &InlinedCallSites)) { for (auto *CB : InlinedCallSites) { if (getInlineCandidate(&NewCandidate, CB)) CQueue.emplace(NewCandidate); diff --git a/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-2.prof b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-2.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-2.prof @@ -0,0 +1,16 @@ +_Z3goov:5860:1 + 1: 5279 _Z3foov:2000 _Z3barv:1000 + 2: 5279 _Z3foov:2000 _Z3barv:1000 + 3: 5279 _Z3foov:2000 _Z3barv:1000 + 1: _Z3hoov:5860 + 1: 5000 + 1: _Z3moov:5860 + 1: 5000 + 2: _Z3hoov:5860 + 1: 5000 + 2: _Z3moov:5860 + 1: 5000 + 3: _Z3hoov:5860 + 1: 5000 + 3: _Z3moov:5860 + 1: 5000 diff --git 
a/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-2.ll @@ -0,0 +1,124 @@ +; RUN: opt < %s -sample-profile-icp-max-prom=2 -passes=sample-profile -sample-profile-file=%S/Inputs/norepeated-icp-2.prof -S | FileCheck %s --check-prefix=MAX2 +; RUN: opt < %s -sample-profile-icp-max-prom=4 -passes=sample-profile -sample-profile-file=%S/Inputs/norepeated-icp-2.prof -S | FileCheck %s --check-prefix=MAX4 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [5 x i8] c"moo\0A\00", align 1 +@p = dso_local global void ()* null, align 8 +@cond = dso_local global i8 0, align 1 +@str = private unnamed_addr constant [4 x i8] c"moo\00", align 1 + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3moov() #0 !dbg !7 { +entry: + %puts = call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @str, i64 0, i64 0)), !dbg !9 + ret void, !dbg !10 +} + +; Function Attrs: nofree nounwind +declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) 
#1 + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3hoov() #0 !dbg !11 { +entry: + %0 = load volatile i8, i8* @cond, align 1, !dbg !12, !range !17 + %tobool.not = icmp eq i8 %0, 0, !dbg !12 + br i1 %tobool.not, label %if.end, label %if.then, !dbg !12 + +if.then: ; preds = %entry + call void @_Z10hoo_calleev(), !dbg !18 + br label %if.end, !dbg !18 + +if.end: ; preds = %if.then, %entry + store void ()* @_Z3moov, void ()** @p, align 8, !dbg !19 + ret void, !dbg !22 +} + +declare !dbg !23 dso_local void @_Z10hoo_calleev() #2 + +; MAX2-LABEL: @_Z3goov( +; MAX2: icmp eq void ()* {{.*}} @_Z3hoov +; MAX2: call void %t0(), {{.*}} !prof ![[PROF_ID1:[0-9]+]] +; MAX2-NOT: icmp eq void ()* {{.*}} @_Z3hoov +; MAX2-NOT: icmp eq void ()* {{.*}} @_Z3moov +; MAX2: call void %t1(), {{.*}} !prof ![[PROF_ID2:[0-9]+]] +; MAX2-NOT: icmp eq void ()* {{.*}} @_Z3hoov +; MAX2-NOT: icmp eq void ()* {{.*}} @_Z3moov +; MAX2: call void %t2(), {{.*}} !prof ![[PROF_ID2:[0-9]+]] +; MAX2: ret void +; MAX4-LABEL: @_Z3goov( +; MAX4: icmp eq void ()* {{.*}} @_Z3hoov +; MAX4: icmp eq void ()* {{.*}} @_Z3moov +; MAX4: call void %t0(), {{.*}} !prof ![[PROF_ID3:[0-9]+]] +; MAX4: icmp eq void ()* {{.*}} @_Z3hoov +; MAX4: icmp eq void ()* {{.*}} @_Z3moov +; MAX4: call void %t1(), {{.*}} !prof ![[PROF_ID4:[0-9]+]] +; MAX4-NOT: icmp eq void ()* {{.*}} @_Z3hoov +; MAX4-NOT: icmp eq void ()* {{.*}} @_Z3moov +; MAX4: call void %t2(), {{.*}} !prof ![[PROF_ID5:[0-9]+]] +; MAX4: ret void + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3goov() #0 !dbg !24 { +entry: + %t0 = load void ()*, void ()** @p, align 8, !dbg !25 + call void %t0(), !dbg !26, !prof !30 + %t1 = load void ()*, void ()** @p, align 8, !dbg !25 + call void %t1(), !dbg !28, !prof !31 + %t2 = load void ()*, void ()** @p, align 8, !dbg !25 + call void %t2(), !dbg !29, !prof !32 + ret void, !dbg !27 +} + +; MAX2: ![[PROF_ID1]] = !{!"VP", i32 0, i64 10000, i64 -7701940972712279918, i64 0, i64 1850239051784516332, 
i64 0} +; MAX2: ![[PROF_ID2]] = !{!"VP", i32 0, i64 10000, i64 3137940972712279918, i64 0, i64 1850239051784516332, i64 0} +; MAX4: ![[PROF_ID3]] = !{!"VP", i32 0, i64 12000, i64 9191153033785521275, i64 2000, i64 -7383239051784516332, i64 0, i64 -7701940972712279918, i64 0, i64 1850239051784516332, i64 0} +; MAX4: ![[PROF_ID4]] = !{!"VP", i32 0, i64 10000, i64 -7383239051784516332, i64 0, i64 -7701940972712279918, i64 0, i64 3137940972712279918, i64 0, i64 1850239051784516332, i64 0} +; MAX4: ![[PROF_ID5]] = !{!"VP", i32 0, i64 10000, i64 4128940972712279918, i64 0, i64 3137940972712279918, i64 0, i64 2132940972712279918, i64 0, i64 1850239051784516332, i64 0} + +; Function Attrs: nofree nounwind +declare noundef i32 @puts(i8* nocapture noundef readonly) #3 + +attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" } +attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" 
"tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nofree nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 13.0.0 (https://github.com/llvm/llvm-project.git f8226e6e284e9f199790bdb330f87d71adb5376f)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "1.cc", directory: "/usr/local/google/home/wmi/workarea/llvm/build/splitprofile") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 13.0.0 (https://github.com/llvm/llvm-project.git f8226e6e284e9f199790bdb330f87d71adb5376f)"} +!7 = distinct !DISubprogram(name: "moo", linkageName: "_Z3moov", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 2, column: 3, scope: !7) +!10 = !DILocation(line: 3, column: 1, scope: !7) +!11 = distinct !DISubprogram(name: "hoo", linkageName: "_Z3hoov", scope: !1, file: !1, line: 9, type: !8, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!12 = !DILocation(line: 10, column: 7, scope: !11) +!13 = !{!14, !14, i64 0} +!14 = !{!"bool", !15, i64 0} +!15 = !{!"omnipotent char", !16, i64 0} +!16 = !{!"Simple C++ TBAA"} +!17 = !{i8 0, i8 2} +!18 = !DILocation(line: 11, column: 5, scope: !11) +!19 = !DILocation(line: 12, column: 5, scope: !11) +!20 = !{!21, !21, i64 0} +!21 = !{!"any pointer", !15, i64 0} +!22 = !DILocation(line: 13, column: 1, scope: !11) +!23 = !DISubprogram(name: "hoo_callee", 
linkageName: "_Z10hoo_calleev", scope: !1, file: !1, line: 5, type: !8, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!24 = distinct !DISubprogram(name: "goo", linkageName: "_Z3goov", scope: !1, file: !1, line: 15, type: !8, scopeLine: 15, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!25 = !DILocation(line: 16, column: 5, scope: !24) +!26 = !DILocation(line: 16, column: 3, scope: !24) +!27 = !DILocation(line: 19, column: 1, scope: !24) +!28 = !DILocation(line: 17, column: 3, scope: !24) +!29 = !DILocation(line: 18, column: 3, scope: !24) +!30 = !{!"VP", i32 0, i64 0, i64 1850239051784516332, i64 0} +!31 = !{!"VP", i32 0, i64 0, i64 1850239051784516332, i64 0, i64 3137940972712279918, i64 0} +!32 = !{!"VP", i32 0, i64 0, i64 1850239051784516332, i64 0, i64 3137940972712279918, i64 0, i64 2132940972712279918, i64 0, i64 4128940972712279918, i64 0}