diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -254,7 +254,6 @@ CODEGENOPT(VectorizeLoop , 1, 0) ///< Run loop vectorizer. CODEGENOPT(VectorizeSLP , 1, 0) ///< Run SLP vectorizer. CODEGENOPT(ProfileSampleAccurate, 1, 0) ///< Sample profile is accurate. -CODEGENOPT(CallGraphProfile , 1, 0) ///< Run call graph profile. /// Attempt to use register sized accesses to bit-fields in structures, when /// possible. diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -620,6 +620,9 @@ PMBuilder.SizeLevel = CodeGenOpts.OptimizeSize; PMBuilder.SLPVectorize = CodeGenOpts.VectorizeSLP; PMBuilder.LoopVectorize = CodeGenOpts.VectorizeLoop; + // Only enable CGProfilePass when using integrated assembler, since + // non-integrated assemblers don't recognize .cgprofile section. + PMBuilder.CallGraphProfile = !CodeGenOpts.DisableIntegratedAS; PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops; // Loop interleaving in the loop vectorizer has historically been set to be @@ -1144,7 +1147,9 @@ PTO.LoopInterleaving = CodeGenOpts.UnrollLoops; PTO.LoopVectorization = CodeGenOpts.VectorizeLoop; PTO.SLPVectorization = CodeGenOpts.VectorizeSLP; - PTO.CallGraphProfile = CodeGenOpts.CallGraphProfile; + // Only enable CGProfilePass when using integrated assembler, since + // non-integrated assemblers don't recognize .cgprofile section. + PTO.CallGraphProfile = !CodeGenOpts.DisableIntegratedAS; PTO.Coroutines = LangOpts.Coroutines; PassInstrumentationCallbacks PIC; @@ -1562,7 +1567,9 @@ Conf.PTO.LoopInterleaving = CGOpts.UnrollLoops; Conf.PTO.LoopVectorization = CGOpts.VectorizeLoop; Conf.PTO.SLPVectorization = CGOpts.VectorizeSLP; - Conf.PTO.CallGraphProfile = CGOpts.CallGraphProfile; + // Only enable CGProfilePass when using integrated assembler, since + // non-integrated assemblers don't recognize .cgprofile section. + Conf.PTO.CallGraphProfile = !CGOpts.DisableIntegratedAS; // Context sensitive profile. if (CGOpts.hasProfileCSIRInstr()) { diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -860,7 +860,6 @@ Opts.RerollLoops = Args.hasArg(OPT_freroll_loops); Opts.DisableIntegratedAS = Args.hasArg(OPT_fno_integrated_as); - Opts.CallGraphProfile = !Opts.DisableIntegratedAS; Opts.Autolink = !Args.hasArg(OPT_fno_autolink); Opts.SampleProfileFile = std::string(Args.getLastArgValue(OPT_fprofile_sample_use_EQ)); diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -103,6 +103,7 @@ void initializeCFIInstrInserterPass(PassRegistry&); void initializeCFLAndersAAWrapperPassPass(PassRegistry&); void initializeCFLSteensAAWrapperPassPass(PassRegistry&); +void initializeCGProfileLegacyPassPass(PassRegistry &); void initializeCallGraphDOTPrinterPass(PassRegistry&); void initializeCallGraphPrinterLegacyPassPass(PassRegistry&); void initializeCallGraphViewerPass(PassRegistry&); diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -282,6 +282,8 @@ ModulePass *createWriteThinLTOBitcodePass(raw_ostream &Str, raw_ostream *ThinLinkOS = nullptr); +ModulePass *createCGProfileLegacyPass(); + } // End llvm namespace #endif diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -156,6 +156,7 @@ bool DisableTailCalls; bool DisableUnrollLoops; + bool CallGraphProfile; bool SLPVectorize; bool LoopVectorize; bool LoopsInterleaved; diff --git a/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h b/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h --- a/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h +++ b/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h @@ -19,11 +19,6 @@ class CGProfilePass : public PassInfoMixin { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); - -private: - void addModuleFlags( - Module &M, - MapVector, uint64_t> &Counts) const; }; } // end namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -248,10 +248,6 @@ EnableCHR("enable-chr-npm", cl::init(true), cl::Hidden, cl::desc("Enable control height reduction optimization (CHR)")); -static cl::opt EnableCallGraphProfile( - "enable-npm-call-graph-profile", cl::init(true), cl::Hidden, - cl::desc("Enable call graph profile pass for the new PM (default = on)")); - /// Flag to enable inline deferral during PGO. static cl::opt EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), @@ -267,7 +263,7 @@ Coroutines = false; LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; - CallGraphProfile = EnableCallGraphProfile; + CallGraphProfile = true; } extern cl::opt EnableHotColdSplit; diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -195,6 +195,7 @@ PrepareForThinLTO = EnablePrepareForThinLTO; PerformThinLTO = EnablePerformThinLTO; DivergentTarget = false; + CallGraphProfile = true; } PassManagerBuilder::~PassManagerBuilder() { @@ -834,6 +835,10 @@ if (MergeFunctions) MPM.add(createMergeFunctionsPass()); + // Add Module flag "CG Profile" based on Branch Frequency Information. + if (CallGraphProfile) + MPM.add(createCGProfileLegacyPass()); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp --- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp +++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp @@ -10,22 +10,47 @@ #include "llvm/ADT/MapVector.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Instrumentation.h" #include using namespace llvm; -PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) { +static bool +addModuleFlags(Module &M, + MapVector, uint64_t> &Counts) { + if (Counts.empty()) + return false; + + LLVMContext &Context = M.getContext(); + MDBuilder MDB(Context); + std::vector Nodes; + + for (auto E : Counts) { + Metadata *Vals[] = {ValueAsMetadata::get(E.first.first), + ValueAsMetadata::get(E.first.second), + MDB.createConstant(ConstantInt::get( + Type::getInt64Ty(Context), E.second))}; + Nodes.push_back(MDNode::get(Context, Vals)); + } + + M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes)); + return true; +} + +static bool runCGProfilePass( + Module &M, function_ref GetBFI, + function_ref GetTTI, bool LazyBFI) { MapVector, uint64_t> Counts; - FunctionAnalysisManager &FAM = - MAM.getResult(M).getManager(); InstrProfSymtab Symtab; auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F, Function *CalledF, uint64_t NewCount) { @@ -35,14 +60,18 @@ Count = SaturatingAdd(Count, NewCount); }; // Ignore error here. Indirect calls are ignored if this fails. - (void)(bool)Symtab.create(M); + (void)(bool) Symtab.create(M); for (auto &F : M) { - if (F.isDeclaration()) + // Avoid extra cost of running passes for BFI when the function doesn't have + // entry count. Since LazyBlockFrequencyInfoPass only exists in LPM, check + // if using LazyBlockFrequencyInfoPass. + // TODO: Remove LazyBFI when LazyBlockFrequencyInfoPass is available in NPM. + if (F.isDeclaration() || (LazyBFI && !F.getEntryCount())) continue; - auto &BFI = FAM.getResult(F); + auto &BFI = GetBFI(F); if (BFI.getEntryFreq() == 0) continue; - TargetTransformInfo &TTI = FAM.getResult(F); + TargetTransformInfo &TTI = GetTTI(F); for (auto &BB : F) { Optional BBCount = BFI.getBlockProfileCount(&BB); if (!BBCount) @@ -69,28 +98,56 @@ } } - addModuleFlags(M, Counts); - - return PreservedAnalyses::all(); + return addModuleFlags(M, Counts); } -void CGProfilePass::addModuleFlags( - Module &M, - MapVector, uint64_t> &Counts) const { - if (Counts.empty()) - return; +namespace { +struct CGProfileLegacyPass final : public ModulePass { + static char ID; + CGProfileLegacyPass() : ModulePass(ID) { + initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry()); + } - LLVMContext &Context = M.getContext(); - MDBuilder MDB(Context); - std::vector Nodes; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + } - for (auto E : Counts) { - Metadata *Vals[] = {ValueAsMetadata::get(E.first.first), - ValueAsMetadata::get(E.first.second), - MDB.createConstant(ConstantInt::get( - Type::getInt64Ty(Context), E.second))}; - Nodes.push_back(MDNode::get(Context, Vals)); + bool runOnModule(Module &M) override { + auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & { + return this->getAnalysis(F).getBFI(); + }; + auto GetTTI = [this](Function &F) -> TargetTransformInfo & { + return this->getAnalysis().getTTI(F); + }; + + return runCGProfilePass(M, GetBFI, GetTTI, true); } +}; - M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes)); +} // namespace + +char CGProfileLegacyPass::ID = 0; + +INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false, + false) + +ModulePass *llvm::createCGProfileLegacyPass() { + return new CGProfileLegacyPass(); +} + +PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) { + FunctionAnalysisManager &FAM = + MAM.getResult(M).getManager(); + auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; + auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + + runCGProfilePass(M, GetBFI, GetTTI, false); + + return PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -112,6 +112,7 @@ initializePGOInstrumentationUseLegacyPassPass(Registry); initializePGOIndirectCallPromotionLegacyPassPass(Registry); initializePGOMemOPSizeOptLegacyPassPass(Registry); + initializeCGProfileLegacyPassPass(Registry); initializeInstrOrderFileLegacyPassPass(Registry); initializeInstrProfilingLegacyPassPass(Registry); initializeMemorySanitizerLegacyPassPass(Registry); diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -276,6 +276,12 @@ ; GCN-O1-NEXT: Warn about non-applied transformations ; GCN-O1-NEXT: Alignment from assumptions ; GCN-O1-NEXT: Strip Unused Function Prototypes +; GCN-O1-NEXT: Call Graph Profile +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Lazy Branch Probability Analysis +; GCN-O1-NEXT: Lazy Block Frequency Analysis ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: Natural Loop Information @@ -623,6 +629,12 @@ ; GCN-O2-NEXT: Strip Unused Function Prototypes ; GCN-O2-NEXT: Dead Global Elimination ; GCN-O2-NEXT: Merge Duplicate Global Constants +; GCN-O2-NEXT: Call Graph Profile +; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: Natural Loop Information @@ -975,6 +987,12 @@ ; GCN-O3-NEXT: Strip Unused Function Prototypes ; GCN-O3-NEXT: Dead Global Elimination ; GCN-O3-NEXT: Merge Duplicate Global Constants +; GCN-O3-NEXT: Call Graph Profile +; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: Natural Loop Information diff --git a/llvm/test/Instrumentation/cgprofile.ll b/llvm/test/Instrumentation/cgprofile.ll --- a/llvm/test/Instrumentation/cgprofile.ll +++ b/llvm/test/Instrumentation/cgprofile.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -passes cg-profile -S | FileCheck %s +; RUN: opt < %s -cg-profile -S | FileCheck %s declare void @b() diff --git a/llvm/test/Other/new-pm-cgprofile.ll b/llvm/test/Other/new-pm-cgprofile.ll deleted file mode 100644 --- a/llvm/test/Other/new-pm-cgprofile.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: opt -debug-pass-manager -passes='default' %s 2>&1 |FileCheck %s --check-prefixes=DEFAULT -; RUN: opt -debug-pass-manager -passes='default' -enable-npm-call-graph-profile=0 %s 2>&1 |FileCheck %s --check-prefixes=OFF -; RUN: opt -debug-pass-manager -passes='default' -enable-npm-call-graph-profile=1 %s 2>&1 |FileCheck %s --check-prefixes=ON -; -; DEFAULT: Running pass: CGProfilePass -; OFF-NOT: Running pass: CGProfilePass -; ON: Running pass: CGProfilePass - -define void @foo() { - ret void -} diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -280,6 +280,12 @@ ; CHECK-NEXT: Strip Unused Function Prototypes ; CHECK-NEXT: Dead Global Elimination ; CHECK-NEXT: Merge Duplicate Global Constants +; CHECK-NEXT: Call Graph Profile +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -285,6 +285,12 @@ ; CHECK-NEXT: Strip Unused Function Prototypes ; CHECK-NEXT: Dead Global Elimination ; CHECK-NEXT: Merge Duplicate Global Constants +; CHECK-NEXT: Call Graph Profile +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -266,6 +266,12 @@ ; CHECK-NEXT: Strip Unused Function Prototypes ; CHECK-NEXT: Dead Global Elimination ; CHECK-NEXT: Merge Duplicate Global Constants +; CHECK-NEXT: Call Graph Profile +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information