diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -260,7 +260,8 @@ InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], - uint32_t &ActualNumValueData, uint64_t &TotalC); + uint32_t &ActualNumValueData, uint64_t &TotalC, + bool GetZeroCntValue = false); inline StringRef getPGOFuncNameMetadataName() { return "PGOFuncName"; } diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp --- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -45,7 +45,7 @@ // Set the maximum number of targets to promote for a single indirect-call // callsite. -static cl::opt +cl::opt MaxNumPromotions("icp-max-prom", cl::init(3), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of promotions for a single indirect " "call callsite")); diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -982,7 +982,8 @@ InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], - uint32_t &ActualNumValueData, uint64_t &TotalC) { + uint32_t &ActualNumValueData, uint64_t &TotalC, + bool GetZeroCntValue) { MDNode *MD = Inst.getMetadata(LLVMContext::MD_prof); if (!MD) return false; @@ -1009,7 +1010,7 @@ // Get total count ConstantInt *TotalCInt = mdconst::dyn_extract(MD->getOperand(2)); - if (!TotalCInt) + if (!TotalCInt && !GetZeroCntValue) return false; TotalC = TotalCInt->getZExtValue(); @@ -1021,7 +1022,7 @@ ConstantInt *Value = mdconst::dyn_extract(MD->getOperand(I)); ConstantInt *Count = mdconst::dyn_extract(MD->getOperand(I + 1)); - if (!Value || !Count) + if (!Value || (!Count && !GetZeroCntValue)) return false; ValueData[ActualNumValueData].Value = Value->getZExtValue(); ValueData[ActualNumValueData].Count = Count->getZExtValue(); diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -218,6 +218,8 @@ "by inlining from sample profile loader."), cl::Hidden); +extern cl::opt MaxNumPromotions; + namespace { using BlockWeightMap = DenseMap; @@ -361,8 +363,7 @@ // Attempt to promote indirect call and also inline the promoted call bool tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, - uint64_t &Sum, DenseSet &PromotedInsns, - SmallVector *InlinedCallSites = nullptr); + uint64_t &Sum, SmallVector *InlinedCallSites = nullptr); bool inlineHotFunctions(Function &F, DenseSet &InlinedGUIDs); InlineCost shouldInlineCandidate(InlineCandidate &Candidate); @@ -688,18 +689,97 @@ return it.first->second; } +/// If the profile count for the promotion candidate \p Candidate is 0, +/// it means \p Candidate has already been promoted for \p Inst. +static bool isPromotedBefore(const Instruction &Inst, StringRef Candidate) { + uint32_t NumVals = 0; + uint64_t TotalCount = 0; + std::unique_ptr ValueData = + std::make_unique(MaxNumPromotions); + bool Valid = + getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions, + ValueData.get(), NumVals, TotalCount, true); + if (Valid) { + for (uint32_t I = 0; I < NumVals; I++) { + // If the promotion candidate has 0 count in the metadata, it + // means the candidate has been promoted for this indirect call. + if (ValueData[I].Value == Function::getGUID(Candidate)) + return ValueData[I].Count == 0; + } + } + return false; +} + +/// Update indirect call target profile metadata for \p Inst. If \p Total +/// is given, set TotalCount of call targets counts to \p Total, otherwise +/// keep the original value in metadata. +static void +updateIDTMetaData(Instruction &Inst, + const SmallVectorImpl &CallTargets, + uint64_t Total = 0) { + DenseMap ValueCountMap; + + uint32_t NumVals = 0; + uint64_t TotalCount = 0; + std::unique_ptr ValueData = + std::make_unique(MaxNumPromotions); + bool Valid = + getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions, + ValueData.get(), NumVals, TotalCount, true); + if (Valid) { + for (uint32_t I = 0; I < NumVals; I++) + ValueCountMap[ValueData[I].Value] = ValueData[I].Count; + } + + for (const auto &Data : CallTargets) { + auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count); + if (Pair.second) + continue; + // Update existing profile count of the call target if it is not 0. + // If it is 0, the call target has been promoted so keep it as 0. + if (Pair.first->second != 0) + Pair.first->second = Data.Count; + else { + assert(Total >= Data.Count && "Total should be >= Data.Count"); + Total -= Data.Count; + } + } + + SmallVector NewCallTargets; + for (const auto &ValueCount : ValueCountMap) { + NewCallTargets.emplace_back( + InstrProfValueData{ValueCount.first, ValueCount.second}); + } + llvm::sort(NewCallTargets, + [](const InstrProfValueData &L, const InstrProfValueData &R) { + if (L.Count != R.Count) + return L.Count > R.Count; + return L.Value > R.Value; + }); + annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst, + NewCallTargets, Total ? Total : TotalCount, + IPVK_IndirectCallTarget, NewCallTargets.size()); +} + /// Attempt to promote indirect call and also inline the promoted call. /// /// \param F Caller function. /// \param Candidate ICP and inline candidate. /// \param Sum Sum of target counts for indirect call. -/// \param PromotedInsns Map to keep track of indirect call already processed. /// \param InlinedCallSite Output vector for new call sites exposed after /// inlining. bool SampleProfileLoader::tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, - DenseSet &PromotedInsns, SmallVector *InlinedCallSite) { + auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName(); + auto R = SymbolMap.find(CalleeFunctionName); + if (R == SymbolMap.end() || !R->getValue()) + return false; + + auto &CI = *Candidate.CallInstr; + if (isPromotedBefore(CI, R->getValue()->getName())) + return false; + const char *Reason = "Callee function not available"; // R->getValue() != &F is to prevent promoting a recursive call. // If it is a recursive call, we do not inline it as it could bloat @@ -707,15 +787,17 @@ // clone the caller first, and inline the cloned caller if it is // recursive. As llvm does not inline recursive calls, we will // simply ignore it instead of handling it explicitly. - auto R = SymbolMap.find(Candidate.CalleeSamples->getFuncName()); - if (R != SymbolMap.end() && R->getValue() && - !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && + if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && R->getValue()->hasFnAttribute("use-sample-profile") && - R->getValue() != &F && - isLegalToPromote(*Candidate.CallInstr, R->getValue(), &Reason)) { - auto *DI = - &pgo::promoteIndirectCall(*Candidate.CallInstr, R->getValue(), - Candidate.CallsiteCount, Sum, false, ORE); + R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) { + // For promoted target, save 0 count in the value profile metadata so + // the target won't be promoted again. + SmallVector SortedCallTargets = { + InstrProfValueData{Function::getGUID(R->getValue()->getName()), 0}}; + updateIDTMetaData(CI, SortedCallTargets); + + auto *DI = &pgo::promoteIndirectCall( + CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE); if (DI) { Sum -= Candidate.CallsiteCount; // Prorate the indirect callsite distribution. @@ -724,10 +806,8 @@ // profile will be used to prorate callsites from the callee if // inlined. Once not inlined, the direct callsite distribution should // be prorated so that the it will reflect the real callsite counts. - setProbeDistributionFactor(*Candidate.CallInstr, - Candidate.CallsiteDistribution * Sum / - SumOrigin); - PromotedInsns.insert(Candidate.CallInstr); + setProbeDistributionFactor(CI, Candidate.CallsiteDistribution * Sum / + SumOrigin); Candidate.CallInstr = DI; if (isa(DI) || isa(DI)) { bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite); @@ -800,8 +880,6 @@ /// \returns True if there is any inline happened. bool SampleProfileLoader::inlineHotFunctions( Function &F, DenseSet &InlinedGUIDs) { - DenseSet PromotedInsns; - // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. assert((!ProfAccForSymsInList || @@ -855,8 +933,6 @@ if (CalledFunction == &F) continue; if (I->isIndirectCall()) { - if (PromotedInsns.count(I)) - continue; uint64_t Sum; for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { uint64_t SumOrigin = Sum; @@ -869,8 +945,7 @@ continue; Candidate = {I, FS, FS->getEntrySamples(), 1.0}; - if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, - PromotedInsns)) { + if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) { LocalNotInlinedCallSites.erase(I); LocalChanged = true; } @@ -1079,7 +1154,6 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet &InlinedGUIDs) { - DenseSet PromotedInsns; assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure @@ -1128,8 +1202,6 @@ if (CalledFunction == &F) continue; if (I->isIndirectCall()) { - if (PromotedInsns.count(I)) - continue; uint64_t Sum; auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum); uint64_t SumOrigin = Sum; @@ -1164,7 +1236,7 @@ Candidate = {I, FS, EntryCountDistributed, Candidate.CallsiteDistribution}; if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, - PromotedInsns, &InlinedCallSites)) { + &InlinedCallSites)) { for (auto *CB : InlinedCallSites) { if (getInlineCandidate(&NewCandidate, CB)) CQueue.emplace(NewCandidate); @@ -1261,9 +1333,7 @@ Sum += NameFS.second.getEntrySamples(); } } - annotateValueSite(*I.getParent()->getParent()->getParent(), I, - SortedCallTargets, Sum, IPVK_IndirectCallTarget, - SortedCallTargets.size()); + updateIDTMetaData(I, SortedCallTargets, Sum); } else if (!isa(&I)) { I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights( diff --git a/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof @@ -0,0 +1,13 @@ +_Z3foov:225715:1 + 2: 5553 + 3: 5391 + 1: _Z3goov:5860 + 1: 5279 + 2: 5279 + 1: _Z3hoov:5860 + 1: 5000 +_Z3goov:5860:1 + 1: 5279 + 2: 5279 + 1: _Z3hoov:5860 + 1: 5000 diff --git a/llvm/test/Transforms/SampleProfile/indirect-call.ll b/llvm/test/Transforms/SampleProfile/indirect-call.ll --- a/llvm/test/Transforms/SampleProfile/indirect-call.ll +++ b/llvm/test/Transforms/SampleProfile/indirect-call.ll @@ -202,7 +202,7 @@ ; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398} ; CHECK: ![[BR1]] = !{!"branch_weights", i32 4000, i32 4000} ; CHECK: ![[BR2]] = !{!"branch_weights", i32 3000, i32 1000} -; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000} +; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000, i64 7476224446746900038, i64 0, i64 925324185419832389, i64 0} ; CHECK: ![[BR3]] = !{!"branch_weights", i32 1, i32 0} !6 = distinct !DISubprogram(name: "test_inline", scope: !1, file: !1, line: 6, unit: !0) !7 = !DILocation(line: 7, scope: !6) diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/norepeated-icp.prof -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [5 x i8] c"hoo\0A\00", align 1 +@p = dso_local global void ()* null, align 8 +@str = private unnamed_addr constant [4 x i8] c"hoo\00", align 1 + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3hoov() #0 !dbg !7 { +entry: + %puts = call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @str, i64 0, i64 0)), !dbg !9 + ret void, !dbg !10 +} + +; Function Attrs: nofree nounwind +declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) #1 + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3goov() #0 !dbg !11 { +entry: + %0 = load void ()*, void ()** @p, align 8, !dbg !12, !tbaa !13 + call void %0(), !dbg !17 + ret void, !dbg !18 +} + +; Check the indirect call in _Z3goov inlined into _Z3foov won't be indirect +; call promoted for _Z3hoov twice in _Z3foov. +; CHECK-LABEL: @_Z3foov( +; CHECK: icmp eq void ()* {{.*}} @_Z3hoov +; CHECK-NOT: icmp eq void ()* {{.*}} @_Z3hoov +; CHECK: ret void + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3foov() #0 !dbg !19 { +entry: + call void @_Z3goov(), !dbg !20 + ret void, !dbg !21 +} + +; Function Attrs: nofree nounwind +declare noundef i32 @puts(i8* nocapture noundef readonly) #2 + +attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" } +attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nofree nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "1.cc", directory: "") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!""} +!7 = distinct !DISubprogram(name: "hoo", linkageName: "_Z3hoov", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 2, column: 3, scope: !7) +!10 = !DILocation(line: 3, column: 1, scope: !7) +!11 = distinct !DISubprogram(name: "goo", linkageName: "_Z3goov", scope: !1, file: !1, line: 6, type: !8, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!12 = !DILocation(line: 7, column: 5, scope: !11) +!13 = !{!14, !14, i64 0} +!14 = !{!"any pointer", !15, i64 0} +!15 = !{!"omnipotent char", !16, i64 0} +!16 = !{!"Simple C++ TBAA"} +!17 = !DILocation(line: 7, column: 3, scope: !11) +!18 = !DILocation(line: 8, column: 1, scope: !11) +!19 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 10, type: !8, scopeLine: 10, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!20 = !DILocation(line: 11, column: 3, scope: !19) +!21 = !DILocation(line: 12, column: 3, scope: !19)