Index: include/llvm/Analysis/LoopAnalysisManager.h =================================================================== --- include/llvm/Analysis/LoopAnalysisManager.h +++ include/llvm/Analysis/LoopAnalysisManager.h @@ -152,6 +152,11 @@ LoopStandardAnalysisResults &> FunctionAnalysisManagerLoopProxy; +/// A proxy from a \c ModuleAnalysisManager to a \c Loop. +typedef OuterAnalysisManagerProxy + ModuleAnalysisManagerLoopProxy; + /// Returns the minimum set of Analyses that all loop passes must preserve. PreservedAnalyses getLoopPassPreservedAnalyses(); } Index: include/llvm/Passes/PassBuilder.h =================================================================== --- include/llvm/Passes/PassBuilder.h +++ include/llvm/Passes/PassBuilder.h @@ -63,6 +63,7 @@ TargetMachine *TM; Optional PGOOpt; PassInstrumentationCallbacks *PIC; + Optional ModuleHasProfileSummary; // True for ThinLTO post-link + PGO. public: /// A struct to capture parsed pass pipeline names. @@ -178,8 +179,10 @@ explicit PassBuilder(TargetMachine *TM = nullptr, Optional PGOOpt = None, - PassInstrumentationCallbacks *PIC = nullptr) - : TM(TM), PGOOpt(PGOOpt), PIC(PIC) {} + PassInstrumentationCallbacks *PIC = nullptr, + Optional ModuleHasProfileSummary = None) + : TM(TM), PGOOpt(PGOOpt), PIC(PIC), + ModuleHasProfileSummary(ModuleHasProfileSummary) {} /// Cross register the analysis managers through their proxies. /// Index: lib/LTO/LTOBackend.cpp =================================================================== --- lib/LTO/LTOBackend.cpp +++ lib/LTO/LTOBackend.cpp @@ -157,7 +157,7 @@ PGOOpt = PGOOptions("", "", Conf.SampleProfile, Conf.ProfileRemapping, false, true); - PassBuilder PB(TM, PGOOpt); + PassBuilder PB(TM, PGOOpt, nullptr, Mod.getProfileSummary() != nullptr); AAManager AA; // Parse a custom AA pipeline if asked to. 
Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -416,6 +416,12 @@ // minimal multiplication trees. FPM.addPass(ReassociatePass()); + bool HasProfile = (PGOOpt && + (!PGOOpt->SampleProfileFile.empty() || + !PGOOpt->ProfileUseFile.empty())) || + (ModuleHasProfileSummary && // e.g. ThinLTO post-link. + ModuleHasProfileSummary.getValue()); + // Add the primary loop simplification pipeline. // FIXME: Currently this is split into two loop pass pipelines because we run // some function passes in between them. These can and should be removed @@ -425,7 +431,8 @@ // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to // fully replace `SimplifyCFGPass`, and the closest to the other we have is // `LoopInstSimplify`. - LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging); + LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging), + LPM3(DebugLogging); // Simplify the loop body. We do this initially to clean up after other loop // passes run, either when iterating on a loop or on inner loops with @@ -449,10 +456,11 @@ // inaccurate. if (Phase != ThinLTOPhase::PreLink || !PGOOpt || PGOOpt->SampleProfileFile.empty()) - LPM2.addPass(LoopFullUnrollPass(Level)); + // With profile, split into LPM3 so we can insert BFI. + (!HasProfile ? LPM2 : LPM3).addPass(LoopFullUnrollPass(Level)); for (auto &C : LoopOptimizerEndEPCallbacks) - C(LPM2, Level); + C(!HasProfile ? LPM2 : LPM3, Level); // We provide the opt remark emitter pass for LICM to use. We only need to do // this once as it is immutable. @@ -461,6 +469,11 @@ FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), DebugLogging)); + if (HasProfile) { + // With profile, require BFI for LoopFullUnrollPass. 
+ FPM.addPass(RequireAnalysisPass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM3), DebugLogging)); + } // Eliminate redundancies. if (Level != O1) { @@ -573,8 +586,10 @@ MPM.addPass(InstrProfiling(Options, false)); } - if (!ProfileUseFile.empty()) + if (!ProfileUseFile.empty()) { MPM.addPass(PGOInstrumentationUse(ProfileUseFile, ProfileRemappingFile)); + MPM.addPass(RequireAnalysisPass()); + } } static InlineParams @@ -647,6 +662,7 @@ MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile, PGOOpt->ProfileRemappingFile, Phase == ThinLTOPhase::PreLink)); + MPM.addPass(RequireAnalysisPass()); // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard // for the profile annotation to be accurate in the ThinLTO backend. if (Phase != ThinLTOPhase::PreLink) @@ -1040,6 +1056,7 @@ MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile, PGOOpt->ProfileRemappingFile, false /* ThinLTOPhase::PreLink */)); + MPM.addPass(RequireAnalysisPass()); } // Remove unused virtual tables to improve the quality of code generated by @@ -1980,6 +1997,7 @@ FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); + LAM.registerPass([&] { return ModuleAnalysisManagerLoopProxy(MAM); }); } Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -963,7 +964,9 @@ static LoopUnrollResult 
tryToUnrollLoop( Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, - OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, + OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, Optional ProvidedRuntime, Optional ProvidedUpperBound, @@ -985,6 +988,10 @@ if (OnlyWhenForced && !(TM & TM_Enable)) return LoopUnrollResult::Unmodified; + // + // Do some profile-guided optimizations using BFI and PSI here. + // + unsigned NumInlineCandidates; bool NotDuplicatable; bool Convergent; @@ -1170,7 +1177,8 @@ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced, + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, + PreserveLCSSA, OptLevel, OnlyWhenForced, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); @@ -1236,6 +1244,10 @@ "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); + auto *BFI = FAM.getCachedResult(*F); + auto &MAM = AM.getResult(L, AR).getManager(); + auto *PSI = MAM.getCachedResult(*F->getParent()); + // Keep track of the previous loop structure so we can identify new loops // created by unrolling. 
Loop *ParentL = L.getParentLoop(); @@ -1248,7 +1260,7 @@ std::string LoopName = L.getName(); bool Changed = - tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, BFI, PSI, /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, /*Count*/ None, /*Threshold*/ None, /*AllowPartial*/ false, @@ -1386,7 +1398,7 @@ // The API here is quite complex to call and we allow to select some // flavors of unrolling during construction time (by setting UnrollOpts). LoopUnrollResult Result = tryToUnrollLoop( - &L, DT, &LI, SE, TTI, AC, ORE, + &L, DT, &LI, SE, TTI, AC, ORE, nullptr, nullptr, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,