Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -129,6 +129,10 @@ uint64_t getColdCountThreshold() { return ColdCountThreshold ? ColdCountThreshold.getValue() : 0; } + /// Returns true if function \p F and/or basic block \p BB are suggested to be + /// size-optimized based on the profile. + bool shouldOptimizeForSize(Function *F, BasicBlock *BB, + BlockFrequencyInfo *BFI); }; /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo. Index: include/llvm/Transforms/Scalar/ConstantHoisting.h =================================================================== --- include/llvm/Transforms/Scalar/ConstantHoisting.h +++ include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -55,6 +55,7 @@ class Function; class GlobalVariable; class Instruction; +class ProfileSummaryInfo; class TargetTransformInfo; /// A private "module" namespace for types and utilities used by @@ -124,7 +125,8 @@ // Glue for old PM. bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - BlockFrequencyInfo *BFI, BasicBlock &Entry); + BlockFrequencyInfo *BFI, BasicBlock &Entry, + ProfileSummaryInfo *PSI); void cleanup() { ClonedCastMap.clear(); @@ -148,6 +150,7 @@ LLVMContext *Ctx; const DataLayout *DL; BasicBlock *Entry; + ProfileSummaryInfo *PSI; /// Keeps track of constant candidates found in the function. 
using ConstCandVecType = std::vector; Index: include/llvm/Transforms/Utils/SimplifyLibCalls.h =================================================================== --- include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -28,6 +28,8 @@ class BasicBlock; class Function; class OptimizationRemarkEmitter; +class BlockFrequencyInfo; +class ProfileSummaryInfo; /// This class implements simplifications for calls to fortified library /// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to, @@ -74,6 +76,8 @@ const DataLayout &DL; const TargetLibraryInfo *TLI; OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; bool UnsafeFPShrink; function_ref Replacer; function_ref Eraser; @@ -101,6 +105,7 @@ LibCallSimplifier( const DataLayout &DL, const TargetLibraryInfo *TLI, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, function_ref Replacer = &replaceAllUsesWithDefault, function_ref Eraser = &eraseFromParentDefault); Index: include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- include/llvm/Transforms/Utils/UnrollLoop.h +++ include/llvm/Transforms/Utils/UnrollLoop.h @@ -24,11 +24,13 @@ class AssumptionCache; class BasicBlock; +class BlockFrequencyInfo; class DependenceInfo; class DominatorTree; class Loop; class LoopInfo; class MDNode; +class ProfileSummaryInfo; class OptimizationRemarkEmitter; class ScalarEvolution; @@ -120,7 +122,8 @@ MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name); TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( - Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, 
Optional UserAllowPeeling); Index: include/llvm/Transforms/Vectorize/LoopVectorize.h =================================================================== --- include/llvm/Transforms/Vectorize/LoopVectorize.h +++ include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -71,6 +71,7 @@ class LoopAccessInfo; class LoopInfo; class OptimizationRemarkEmitter; +class ProfileSummaryInfo; class ScalarEvolution; class TargetLibraryInfo; class TargetTransformInfo; @@ -96,6 +97,7 @@ AssumptionCache *AC; std::function *GetLAA; OptimizationRemarkEmitter *ORE; + ProfileSummaryInfo *PSI; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); @@ -105,7 +107,7 @@ BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function &GetLAA_, - OptimizationRemarkEmitter &ORE); + OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_); bool processLoop(Loop *L); }; Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -57,6 +57,10 @@ cl::desc("A fixed cold count that overrides the count derived from" " profile-summary-cutoff-cold")); +static cl::opt ProfileGuidedSizeOpt( + "pgso", cl::Hidden, cl::init(true), + cl::desc("Enable the profile guided size optimization. ")); + // Find the summary entry for a desired percentile of counts. 
static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS, uint64_t Percentile) { @@ -323,3 +327,12 @@ } char ProfileSummaryInfoWrapperPass::ID = 0; + +bool ProfileSummaryInfo::shouldOptimizeForSize(Function *F, BasicBlock *BB, + BlockFrequencyInfo *BFI) { + if (!hasProfileSummary()) + return false; + if (BB && BFI) + return ProfileGuidedSizeOpt && isColdBlock(BB, BFI); + return ProfileGuidedSizeOpt && isFunctionEntryCold(F); +} Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -574,8 +574,12 @@ Options.DoCounterPromotion = true; Options.UseBFIInPromotion = IsCS; MPM.addPass(InstrProfiling(Options, IsCS)); - } else if (!ProfileFile.empty()) + } else if (!ProfileFile.empty()) { MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); + } } static InlineParams @@ -648,6 +652,9 @@ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, Phase == ThinLTOPhase::PreLink)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard // for the profile annotation to be accurate in the ThinLTO backend. if (Phase != ThinLTOPhase::PreLink) @@ -1064,6 +1071,9 @@ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, false /* ThinLTOPhase::PreLink */)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. 
+ MPM.addPass(RequireAnalysisPass()); } // Remove unused virtual tables to improve the quality of code generated by Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4098,7 +4098,7 @@ auto InstCombineErase = [this](Instruction *I) { eraseInstFromFunction(*I); }; - LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW, + LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, InstCombineErase); if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -52,12 +52,14 @@ class APInt; class AssumptionCache; +class BlockFrequencyInfo; class DataLayout; class DominatorTree; class GEPOperator; class GlobalVariable; class LoopInfo; class OptimizationRemarkEmitter; +class ProfileSummaryInfo; class TargetLibraryInfo; class User; @@ -304,6 +306,8 @@ const DataLayout &DL; const SimplifyQuery SQ; OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; // Optional analyses. When non-null, these can both be used to do better // combining and will be updated to reflect any changes. 
@@ -315,11 +319,11 @@ InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, const DataLayout &DL, - LoopInfo *LI) + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), - DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI) {} + DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} /// Run the combiner over the entire worklist until it is empty. /// Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -46,14 +46,17 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -3437,7 +3440,8 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, bool ExpensiveCombines = true, + 
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, LoopInfo *LI = nullptr) { auto &DL = F.getParent()->getDataLayout(); ExpensiveCombines |= EnableExpensiveCombines; @@ -3468,7 +3472,7 @@ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, Builder, F.optForMinSize(), ExpensiveCombines, AA, - AC, TLI, DT, ORE, DL, LI); + AC, TLI, DT, ORE, BFI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; if (!IC.run()) @@ -3488,8 +3492,15 @@ auto *LI = AM.getCachedResult(F); auto *AA = &AM.getResult(F); + const ModuleAnalysisManager &MAM = + AM.getResult(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult(F) : nullptr; + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - ExpensiveCombines, LI)) + BFI, PSI, ExpensiveCombines, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3513,6 +3524,8 @@ AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); } bool InstructionCombiningPass::runOnFunction(Function &F) { @@ -3529,9 +3542,15 @@ // Optional analyses. auto *LIWP = getAnalysisIfAvailable(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + ProfileSummaryInfo *PSI = + &getAnalysis().getPSI(); + BlockFrequencyInfo *BFI = + (PSI && PSI->hasProfileSummary()) ? 
+ &getAnalysis().getBFI() : + nullptr; return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - ExpensiveCombines, LI); + BFI, PSI, ExpensiveCombines, LI); } char InstructionCombiningPass::ID = 0; @@ -3544,6 +3563,8 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) Index: lib/Transforms/Scalar/ConstantHoisting.cpp =================================================================== --- lib/Transforms/Scalar/ConstantHoisting.cpp +++ lib/Transforms/Scalar/ConstantHoisting.cpp @@ -41,6 +41,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" @@ -111,6 +112,7 @@ if (ConstHoistWithBlockFrequency) AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); } @@ -126,6 +128,7 @@ "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) @@ -148,7 +151,8 @@ ConstHoistWithBlockFrequency ? 
&getAnalysis().getBFI() : nullptr, - Fn.getEntryBlock()); + Fn.getEntryBlock(), + &getAnalysis().getPSI()); if (MadeChange) { LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: " @@ -548,7 +552,10 @@ ConstCandVecType::iterator &MaxCostItr) { unsigned NumUses = 0; - if(!Entry->getParent()->optForSize() || std::distance(S,E) > 100) { + bool OptForSize = + Entry->getParent()->optForSize() || + (PSI && PSI->shouldOptimizeForSize(Entry->getParent(), nullptr, nullptr)); + if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) @@ -919,13 +926,14 @@ /// Optimize expensive integer constants in the given function. bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, DominatorTree &DT, BlockFrequencyInfo *BFI, - BasicBlock &Entry) { + BasicBlock &Entry, ProfileSummaryInfo *PSI) { this->TTI = &TTI; this->DT = &DT; this->BFI = BFI; this->DL = &Fn.getParent()->getDataLayout(); this->Ctx = &Fn.getContext(); this->Entry = &Entry; + this->PSI = PSI; // Collect all constant candidates. collectConstantCandidates(Fn); @@ -962,7 +970,9 @@ auto BFI = ConstHoistWithBlockFrequency ? 
&AM.getResult(F) : nullptr; - if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock())) + auto &MAM = AM.getResult(F).getManager(); + auto *PSI = MAM.getCachedResult(*F.getParent()); + if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI)) return PreservedAnalyses::all(); PreservedAnalyses PA; Index: lib/Transforms/Scalar/LoopLoadElimination.cpp =================================================================== --- lib/Transforms/Scalar/LoopLoadElimination.cpp +++ lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -29,11 +29,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -159,8 +162,9 @@ class LoadEliminationForLoop { public: LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, - DominatorTree *DT) - : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {} + DominatorTree *DT, BlockFrequencyInfo *BFI, + ProfileSummaryInfo* PSI) + : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {} /// Look through the loop-carried and loop-independent dependences in /// this loop and find store->load dependences. 
@@ -529,7 +533,11 @@ } if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { - if (L->getHeader()->getParent()->optForSize()) { + bool OptForSize = + L->getHeader()->getParent()->optForSize() || + (PSI && BFI && PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI)); + if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " "for size.\n"); @@ -572,6 +580,8 @@ LoopInfo *LI; const LoopAccessInfo &LAI; DominatorTree *DT; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; PredicatedScalarEvolution PSE; }; @@ -579,6 +589,7 @@ static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, function_ref GetLAI) { // Build up a worklist of inner-loops to transform to avoid iterator // invalidation. @@ -597,7 +608,7 @@ bool Changed = false; for (Loop *L : Worklist) { // The actual work is performed by LoadEliminationForLoop. - LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT); + LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI); Changed |= LEL.processLoop(); } return Changed; @@ -622,10 +633,14 @@ auto &LI = getAnalysis().getLoopInfo(); auto &LAA = getAnalysis(); auto &DT = getAnalysis().getDomTree(); + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; // Process each loop nest in the function. 
return eliminateLoadsAcrossLoops( - F, LI, DT, + F, LI, DT, BFI, PSI, [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); }); } @@ -638,6 +653,8 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); } }; @@ -653,6 +670,8 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) FunctionPass *llvm::createLoopLoadEliminationPass() { @@ -668,13 +687,17 @@ auto &TLI = AM.getResult(F); auto &AA = AM.getResult(F); auto &AC = AM.getResult(F); + auto &MAM = AM.getResult(F).getManager(); + auto *PSI = MAM.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult(F) : nullptr; MemorySSA *MSSA = EnableMSSALoopDependency ? 
&AM.getResult(F).getMSSA() : nullptr; auto &LAM = AM.getResult(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( - F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & { + F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult(L, AR); }); Index: lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -294,7 +294,8 @@ return LoopUnrollResult::Unmodified; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, OptLevel, None, None, None, None, None, None); + L, SE, TTI, nullptr, nullptr, OptLevel, + None, None, None, None, None, None); if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -23,7 +23,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -165,7 +167,8 @@ /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. 
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( - Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserAllowPeeling) { @@ -198,7 +201,12 @@ TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes - if (L->getHeader()->getParent()->optForSize()) { + bool OptForSize = + L->getHeader()->getParent()->optForSize() || + (PSI && BFI && + PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI)); + if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; } @@ -963,7 +971,9 @@ static LoopUnrollResult tryToUnrollLoop( Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, - OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, + OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, Optional ProvidedRuntime, Optional ProvidedUpperBound, @@ -989,7 +999,7 @@ bool NotDuplicatable; bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, + L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); // Exit early if unrolling is disabled. 
@@ -1170,7 +1180,8 @@ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced, + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, + PreserveLCSSA, OptLevel, OnlyWhenForced, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); @@ -1249,6 +1260,7 @@ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, /*Count*/ None, /*Threshold*/ None, /*AllowPartial*/ false, @@ -1351,6 +1363,8 @@ AM.getResult(F).getManager(); ProfileSummaryInfo *PSI = MAM.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult(F) : nullptr; bool Changed = false; @@ -1386,7 +1400,7 @@ // The API here is quite complex to call and we allow to select some // flavors of unrolling during construction time (by setting UnrollOpts). 
LoopUnrollResult Result = tryToUnrollLoop( - &L, DT, &LI, SE, TTI, AC, ORE, + &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, Index: lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- lib/Transforms/Utils/SimplifyLibCalls.cpp +++ lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -16,8 +16,10 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" @@ -2327,7 +2329,11 @@ // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. 
- if (CI->getFunction()->optForSize()) + bool OptForSize = + CI->getFunction()->optForSize() || + (PSI && BFI && + PSI->shouldOptimizeForSize(CI->getFunction(), CI->getParent(), BFI)); + if (OptForSize) return nullptr; // Check if has any use @@ -2702,9 +2708,10 @@ LibCallSimplifier::LibCallSimplifier( const DataLayout &DL, const TargetLibraryInfo *TLI, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, function_ref Replacer, function_ref Eraser) - : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), + : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI), UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {} void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -88,6 +88,7 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -1458,12 +1459,13 @@ auto *LAA = &getAnalysis(); auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); + auto *PSI = &getAnalysis().getPSI(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA, *ORE); + GetLAA, *ORE, PSI); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1489,6 +1491,7 @@ AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); } }; @@ -6042,6 +6045,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -7115,7 +7119,8 @@ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) { + OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); @@ -7133,7 +7138,10 @@ // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->optForSize() || + (PSI && BFI && PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI))); // Plan how to best vectorize, return the best VF and its cost. VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); @@ -7214,7 +7222,10 @@ // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->optForSize() || + (PSI && BFI && PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI))); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7223,7 +7234,7 @@ // pipeline. 
if (!L->empty()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, Hints); + ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip @@ -7489,7 +7500,7 @@ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function &GetLAA_, - OptimizationRemarkEmitter &ORE_) { + OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; TTI = &TTI_; @@ -7501,6 +7512,7 @@ GetLAA = &GetLAA_; DB = &DB_; ORE = &ORE_; + PSI = PSI_; // Don't attempt if // 1. the target claims to have no vector registers, and @@ -7569,8 +7581,12 @@ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult(L, AR); }; + const ModuleAnalysisManager &MAM = + AM.getResult(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult(*F.getParent()); bool Changed = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; Index: test/Other/new-pm-defaults.ll =================================================================== --- test/Other/new-pm-defaults.ll +++ test/Other/new-pm-defaults.ll @@ -106,6 +106,7 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running analysis: AAManager +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
@@ -245,7 +246,6 @@ ; CHECK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis Index: test/Other/new-pm-lto-defaults.ll =================================================================== --- test/Other/new-pm-lto-defaults.ll +++ test/Other/new-pm-lto-defaults.ll @@ -69,6 +69,7 @@ ; CHECK-O2-NEXT: Starting llvm::Function pass manager run. ; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass ; CHECK-O2-NEXT: Running pass: InstCombinePass +; CHECK-O2-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass ; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass> Index: test/Other/new-pm-thinlto-defaults.ll =================================================================== --- test/Other/new-pm-thinlto-defaults.ll +++ test/Other/new-pm-thinlto-defaults.ll @@ -88,6 +88,7 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-PRELINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running analysis: AAManager +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA @@ -219,7 +220,6 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis Index: test/Other/opt-O2-pipeline.ll =================================================================== --- test/Other/opt-O2-pipeline.ll +++ test/Other/opt-O2-pipeline.ll @@ -214,6 +214,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results Index: test/Other/opt-O3-pipeline.ll =================================================================== --- test/Other/opt-O3-pipeline.ll +++ test/Other/opt-O3-pipeline.ll @@ -219,6 +219,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results Index: test/Other/opt-Os-pipeline.ll =================================================================== --- test/Other/opt-Os-pipeline.ll +++ test/Other/opt-Os-pipeline.ll @@ -201,6 +201,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; 
CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results