Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -246,12 +246,10 @@ if (C) return isColdCount(*C); - // In SamplePGO, if the caller has been sampled, and there is no profile - // annotatedon the callsite, we consider the callsite as cold. - // If there is no profile for the caller, and we know the profile is - // accurate, we consider the callsite as cold. + // In SamplePGO if there is no profile for the callsite, and we know the + // profile is accurate, we consider the callsite as cold. return (hasSampleProfile() && - (CS.getCaller()->hasProfileData() || ProfileSampleAccurate || + (ProfileSampleAccurate || CS.getCaller()->hasFnAttribute("profile-sample-accurate"))); } Index: lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- lib/Transforms/IPO/SampleProfile.cpp +++ lib/Transforms/IPO/SampleProfile.cpp @@ -37,6 +37,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -186,7 +187,8 @@ IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); - bool runOnModule(Module &M, ModuleAnalysisManager *AM); + bool runOnModule(Module &M, ModuleAnalysisManager *AM, + ProfileSummaryInfo *_PSI); void dump() { Reader->dump(); } @@ -285,12 +287,23 @@ /// Instead, we will mark GUIDs that needs to be annotated to the function. bool IsThinLTOPreLink; + /// \brief Profile Summary Info computed from sample profile. + ProfileSummaryInfo *PSI = nullptr; + /// \brief Total number of samples collected in this profile. /// /// This is the sum of all the samples collected in all the functions executed /// at runtime. 
uint64_t TotalCollectedSamples = 0; + /// \brief For a callsite which is inlined in the profile, we don't have a + /// sample for the call instruction. Even if the hotCallsite inliner cannot + /// inline such a callsite because it is not hot enough, as long as we think + /// it is not cold, we will save the callsite in the WarmCallsWithoutProf + /// set. We will pass the information to the regular inliner to let it know + /// that such a callsite is not cold either. + DenseSet WarmCallsWithoutProf; + /// \brief Optimization Remark Emitter used to emit diagnostic remarks. OptimizationRemarkEmitter *ORE = nullptr; }; @@ -325,6 +338,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); } private: @@ -470,6 +484,7 @@ Predecessors.clear(); Successors.clear(); CoverageTracker.clear(); + WarmCallsWithoutProf.clear(); } #ifndef NDEBUG @@ -769,6 +784,8 @@ Candidates.push_back(&I); if (callsiteIsHot(Samples, FS)) Hot = true; + else if (PSI && PSI->isHotCount(FS->getTotalSamples())) + WarmCallsWithoutProf.insert(&I); } } if (Hot) { @@ -1251,37 +1268,40 @@ for (auto &BI : F) { BasicBlock *BB = &BI; - if (BlockWeights[BB]) { - for (auto &I : BB->getInstList()) { - if (!isa(I) && !isa(I)) + for (auto &I : BB->getInstList()) { + if (!isa(I) && !isa(I)) + continue; + CallSite CS(&I); + if (!CS.getCalledFunction()) { + const DebugLoc &DLoc = I.getDebugLoc(); + if (!DLoc) continue; - CallSite CS(&I); - if (!CS.getCalledFunction()) { - const DebugLoc &DLoc = I.getDebugLoc(); - if (!DLoc) - continue; - const DILocation *DIL = DLoc; - uint32_t LineOffset = FunctionSamples::getOffset(DIL); - uint32_t Discriminator = DIL->getBaseDiscriminator(); + const DILocation *DIL = DLoc; + uint32_t LineOffset = FunctionSamples::getOffset(DIL); + uint32_t Discriminator = DIL->getBaseDiscriminator(); - const FunctionSamples *FS = findFunctionSamples(I); - if (!FS) - continue; - auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); - if (!T || 
T.get().empty()) - continue; - SmallVector SortedCallTargets = - SortCallTargets(T.get()); - uint64_t Sum; - findIndirectCallFunctionSamples(I, Sum); - annotateValueSite(*I.getParent()->getParent()->getParent(), I, - SortedCallTargets, Sum, IPVK_IndirectCallTarget, - SortedCallTargets.size()); - } else if (!dyn_cast(&I)) { - SmallVector Weights; - Weights.push_back(BlockWeights[BB]); - I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); - } + const FunctionSamples *FS = findFunctionSamples(I); + if (!FS) + continue; + auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); + if (!T || T.get().empty()) + continue; + SmallVector SortedCallTargets = + SortCallTargets(T.get()); + uint64_t Sum; + findIndirectCallFunctionSamples(I, Sum); + annotateValueSite(*I.getParent()->getParent()->getParent(), I, + SortedCallTargets, Sum, IPVK_IndirectCallTarget, + SortedCallTargets.size()); + } else if (!dyn_cast(&I)) { + // If there is no sample in the BB, and the callsite is in + // WarmCallsWithoutProf set, leave the MD_prof field empty + // so that regular inliner will not treat it as cold. 
+ if (BlockWeights[BB] == 0 && WarmCallsWithoutProf.count(&I)) + continue; + SmallVector Weights; + Weights.push_back(BlockWeights[BB]); + I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); } } TerminatorInst *TI = BB->getTerminator(); @@ -1496,6 +1516,7 @@ "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) @@ -1520,10 +1541,15 @@ return new SampleProfileLoaderLegacyPass(Name); } -bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) { +bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, + ProfileSummaryInfo *_PSI) { if (!ProfileIsValid) return false; + PSI = _PSI; + if (M.getProfileSummary() == nullptr) + M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); + // Compute the total number of samples collected in this profile. 
for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); @@ -1554,15 +1580,15 @@ clearFunctionData(); retval |= runOnFunction(F, AM); } - if (M.getProfileSummary() == nullptr) - M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); return retval; } bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { ACT = &getAnalysis(); TTIWP = &getAnalysis(); - return SampleLoader.runOnModule(M, nullptr); + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); + return SampleLoader.runOnModule(M, nullptr, PSI); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { @@ -1604,7 +1630,8 @@ SampleLoader.doInitialization(M); - if (!SampleLoader.runOnModule(M, &AM)) + ProfileSummaryInfo *PSI = &AM.getResult(M); + if (!SampleLoader.runOnModule(M, &AM, PSI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); Index: test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof =================================================================== --- test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof +++ test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof @@ -0,0 +1,11 @@ +main:2257150:0 + 2.1: 5553 + 3: 5391 + 3.1: foo:5860 + 0: 5279 + 1: 5279 + 2: 5279 + 4.1: goo:60 + 0: 20 + 1: 20 + 2: 20 Index: test/Transforms/SampleProfile/warm-inline-instance.ll =================================================================== --- test/Transforms/SampleProfile/warm-inline-instance.ll +++ test/Transforms/SampleProfile/warm-inline-instance.ll @@ -0,0 +1,117 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -sample-profile-inline-hot-threshold=1 -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -sample-profile-inline-hot-threshold=1 -S | FileCheck %s + +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +; Function Attrs: nounwind uwtable +define 
i32 @foo(i32 %x, i32 %y) !dbg !4 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %t0 = load i32, i32* %x.addr, align 4, !dbg !11 + %t1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %t0, %t1, !dbg !11 + ret i32 %add, !dbg !11 +} + +define i32 @goo(i32 %x, i32 %y) { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %t0 = load i32, i32* %x.addr, align 4, !dbg !11 + %t1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %t0, %t1, !dbg !11 + ret i32 %add, !dbg !11 +} + +; Function Attrs: uwtable +define i32 @main() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %i, align 4, !dbg !12 + br label %while.cond, !dbg !13 + +while.cond: ; preds = %if.end, %entry + %t0 = load i32, i32* %i, align 4, !dbg !14 + %inc = add nsw i32 %t0, 1, !dbg !14 + store i32 %inc, i32* %i, align 4, !dbg !14 + %cmp = icmp slt i32 %t0, 400000000, !dbg !14 + br i1 %cmp, label %while.body, label %while.end, !dbg !14 + +while.body: ; preds = %while.cond + %t1 = load i32, i32* %i, align 4, !dbg !16 + %cmp1 = icmp ne i32 %t1, 100, !dbg !16 + br i1 %cmp1, label %if.then, label %if.else, !dbg !16 + +if.then: ; preds = %while.body + %t2 = load i32, i32* %i, align 4, !dbg !18 + %t3 = load i32, i32* %s, align 4, !dbg !18 +; call @foo and call @goo below are not hot callsites so early inliner of afdo +; will not inline them. +; call @foo below is a warm callsite, so don't annotate its weight as 0 and +; regular inliner will not treat it as cold. 
+; CHECK-LABEL: @main( +; CHECK: call i32 @foo(i32 %t2, i32 %t3), !dbg !{{[0-9]+}} +; CHECK-NOT: !prof +; CHECK-SAME: {{$}} + %call1 = call i32 @foo(i32 %t2, i32 %t3), !dbg !18 + store i32 %call1, i32* %s, align 4, !dbg !18 + br label %if.end, !dbg !18 + +if.else: ; preds = %while.body +; call @goo below is a cold callsite, so annotate its weight as 0 and +; regular inliner will treat it as cold. +; CHECK: call i32 @goo(i32 2, i32 3), !dbg !{{[0-9]+}}, !prof [[PROF_LOC:![0-9]+]] +; CHECK: [[PROF_LOC]] = !{!"branch_weights", i32 0} + %call2 = call i32 @goo(i32 2, i32 3), !dbg !26 + store i32 %call2, i32* %s, align 4, !dbg !20 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond, !dbg !22 + +while.end: ; preds = %while.cond + %t4 = load i32, i32* %s, align 4, !dbg !24 + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %t4), !dbg !24 + ret i32 0, !dbg !25 +} + +declare i32 @printf(i8*, ...) #2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, variables: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, variables: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!11 = 
!DILocation(line: 4, scope: !4) +!12 = !DILocation(line: 8, scope: !7) +!13 = !DILocation(line: 9, scope: !7) +!14 = !DILocation(line: 9, scope: !15) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) +!16 = !DILocation(line: 10, scope: !17) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = !DILocation(line: 10, scope: !19) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!20 = !DILocation(line: 10, scope: !21) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) +!22 = !DILocation(line: 10, scope: !23) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) +!24 = !DILocation(line: 11, scope: !7) +!25 = !DILocation(line: 12, scope: !7) +!26 = !DILocation(line: 11, scope: !19)