Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -129,6 +129,10 @@ uint64_t getColdCountThreshold() { return ColdCountThreshold ? ColdCountThreshold.getValue() : 0; } + /// Returns true if function \p F and/or basic block \p BB are suggested to be + /// size-optimized base on the profile. + bool shouldOptimizeForSize(Function *F, BasicBlock *BB, + BlockFrequencyInfo *BFI); }; /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo. Index: include/llvm/Transforms/Scalar/ConstantHoisting.h =================================================================== --- include/llvm/Transforms/Scalar/ConstantHoisting.h +++ include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -55,6 +55,7 @@ class Function; class GlobalVariable; class Instruction; +class ProfileSummaryInfo; class TargetTransformInfo; /// A private "module" namespace for types and utilities used by @@ -124,7 +125,8 @@ // Glue for old PM. bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - BlockFrequencyInfo *BFI, BasicBlock &Entry); + BlockFrequencyInfo *BFI, BasicBlock &Entry, + ProfileSummaryInfo *PSI); void cleanup() { ClonedCastMap.clear(); @@ -148,6 +150,7 @@ LLVMContext *Ctx; const DataLayout *DL; BasicBlock *Entry; + ProfileSummaryInfo *PSI; /// Keeps track of constant candidates found in the function. using ConstCandVecType = std::vector; Index: include/llvm/Transforms/Utils/SimplifyLibCalls.h =================================================================== --- include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -28,6 +28,8 @@ class BasicBlock; class Function; class OptimizationRemarkEmitter; +class BlockFrequencyInfo; +class ProfileSummaryInfo; /// This class implements simplifications for calls to fortified library /// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to, @@ -74,6 +76,8 @@ const DataLayout &DL; const TargetLibraryInfo *TLI; OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; bool UnsafeFPShrink; function_ref Replacer; function_ref Eraser; @@ -101,6 +105,7 @@ LibCallSimplifier( const DataLayout &DL, const TargetLibraryInfo *TLI, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, function_ref Replacer = &replaceAllUsesWithDefault, function_ref Eraser = &eraseFromParentDefault); Index: include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- include/llvm/Transforms/Utils/UnrollLoop.h +++ include/llvm/Transforms/Utils/UnrollLoop.h @@ -24,11 +24,13 @@ class AssumptionCache; class BasicBlock; +class BlockFrequencyInfo; class DependenceInfo; class DominatorTree; class Loop; class LoopInfo; class MDNode; +class ProfileSummaryInfo; class OptimizationRemarkEmitter; class ScalarEvolution; @@ -120,7 +122,8 @@ MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name); TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( - Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserAllowPeeling); Index: include/llvm/Transforms/Vectorize/LoopVectorize.h =================================================================== --- include/llvm/Transforms/Vectorize/LoopVectorize.h +++ include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -71,6 +71,7 @@ class LoopAccessInfo; class LoopInfo; class OptimizationRemarkEmitter; +class ProfileSummaryInfo; class ScalarEvolution; class TargetLibraryInfo; class TargetTransformInfo; @@ -96,6 +97,7 @@ AssumptionCache *AC; std::function *GetLAA; OptimizationRemarkEmitter *ORE; + ProfileSummaryInfo *PSI; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); @@ -105,7 +107,7 @@ BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function &GetLAA_, - OptimizationRemarkEmitter &ORE); + OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_); bool processLoop(Loop *L); }; Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -57,6 +57,10 @@ cl::desc("A fixed cold count that overrides the count derived from" " profile-summary-cutoff-cold")); +static cl::opt ProfileGuidedSizeOpt( + "pgso", cl::Hidden, cl::init(true), + cl::desc("Enable the profile guided size optimization. ")); + // Find the summary entry for a desired percentile of counts. static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS, uint64_t Percentile) { @@ -323,3 +327,13 @@ } char ProfileSummaryInfoWrapperPass::ID = 0; + +bool ProfileSummaryInfo::shouldOptimizeForSize(Function *F, BasicBlock *BB, + BlockFrequencyInfo *BFI) { + assert(BFI); + if (!hasProfileSummary()) + return false; + if (BB) + return ProfileGuidedSizeOpt && isColdBlock(BB, BFI); + return ProfileGuidedSizeOpt && isFunctionColdInCallGraph(F, *BFI); +} Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -574,8 +574,12 @@ Options.DoCounterPromotion = true; Options.UseBFIInPromotion = IsCS; MPM.addPass(InstrProfiling(Options, IsCS)); - } else if (!ProfileFile.empty()) + } else if (!ProfileFile.empty()) { MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); + } } static InlineParams @@ -648,6 +652,9 @@ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, Phase == ThinLTOPhase::PreLink)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard // for the profile annotation to be accurate in the ThinLTO backend. if (Phase != ThinLTOPhase::PreLink) @@ -1064,6 +1071,9 @@ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, false /* ThinLTOPhase::PreLink */)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); } // Remove unused virtual tables to improve the quality of code generated by Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4098,7 +4098,7 @@ auto InstCombineErase = [this](Instruction *I) { eraseInstFromFunction(*I); }; - LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW, + LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, InstCombineErase); if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -52,12 +52,14 @@ class APInt; class AssumptionCache; +class BlockFrequencyInfo; class DataLayout; class DominatorTree; class GEPOperator; class GlobalVariable; class LoopInfo; class OptimizationRemarkEmitter; +class ProfileSummaryInfo; class TargetLibraryInfo; class User; @@ -304,6 +306,8 @@ const DataLayout &DL; const SimplifyQuery SQ; OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; // Optional analyses. When non-null, these can both be used to do better // combining and will be updated to reflect any changes. @@ -315,11 +319,11 @@ InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, const DataLayout &DL, - LoopInfo *LI) + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), - DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI) {} + DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} /// Run the combiner over the entire worklist until it is empty. /// Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -46,14 +46,17 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -3437,7 +3440,8 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, bool ExpensiveCombines = true, + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, LoopInfo *LI = nullptr) { auto &DL = F.getParent()->getDataLayout(); ExpensiveCombines |= EnableExpensiveCombines; @@ -3468,7 +3472,7 @@ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, Builder, F.optForMinSize(), ExpensiveCombines, AA, - AC, TLI, DT, ORE, DL, LI); + AC, TLI, DT, ORE, BFI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; if (!IC.run()) @@ -3488,8 +3492,15 @@ auto *LI = AM.getCachedResult(F); auto *AA = &AM.getResult(F); + const ModuleAnalysisManager &MAM = + AM.getResult(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult(F) : nullptr; + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - ExpensiveCombines, LI)) + BFI, PSI, ExpensiveCombines, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3513,6 +3524,8 @@ AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); } bool InstructionCombiningPass::runOnFunction(Function &F) { @@ -3529,9 +3542,15 @@ // Optional analyses. auto *LIWP = getAnalysisIfAvailable(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + ProfileSummaryInfo *PSI = + &getAnalysis().getPSI(); + BlockFrequencyInfo *BFI = + (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - ExpensiveCombines, LI); + BFI, PSI, ExpensiveCombines, LI); } char InstructionCombiningPass::ID = 0; @@ -3544,6 +3563,8 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) Index: lib/Transforms/Scalar/ConstantHoisting.cpp =================================================================== --- lib/Transforms/Scalar/ConstantHoisting.cpp +++ lib/Transforms/Scalar/ConstantHoisting.cpp @@ -41,6 +41,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" @@ -111,6 +112,7 @@ if (ConstHoistWithBlockFrequency) AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); } @@ -126,6 +128,7 @@ "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) @@ -148,7 +151,8 @@ ConstHoistWithBlockFrequency ? &getAnalysis().getBFI() : nullptr, - Fn.getEntryBlock()); + Fn.getEntryBlock(), + &getAnalysis().getPSI()); if (MadeChange) { LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: " @@ -548,7 +552,11 @@ ConstCandVecType::iterator &MaxCostItr) { unsigned NumUses = 0; - if(!Entry->getParent()->optForSize() || std::distance(S,E) > 100) { + bool OptForSize = + Entry->getParent()->optForSize() || + (PSI && BFI && + PSI->shouldOptimizeForSize(Entry->getParent(), nullptr, BFI)); + if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) @@ -919,13 +927,14 @@ /// Optimize expensive integer constants in the given function. bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, DominatorTree &DT, BlockFrequencyInfo *BFI, - BasicBlock &Entry) { + BasicBlock &Entry, ProfileSummaryInfo *PSI) { this->TTI = &TTI; this->DT = &DT; this->BFI = BFI; this->DL = &Fn.getParent()->getDataLayout(); this->Ctx = &Fn.getContext(); this->Entry = &Entry; + this->PSI = PSI; // Collect all constant candidates. collectConstantCandidates(Fn); @@ -962,7 +971,9 @@ auto BFI = ConstHoistWithBlockFrequency ? &AM.getResult(F) : nullptr; - if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock())) + auto &MAM = AM.getResult(F).getManager(); + auto *PSI = MAM.getCachedResult(*F.getParent()); + if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI)) return PreservedAnalyses::all(); PreservedAnalyses PA; Index: lib/Transforms/Scalar/LoopLoadElimination.cpp =================================================================== --- lib/Transforms/Scalar/LoopLoadElimination.cpp +++ lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -29,11 +29,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -159,8 +162,9 @@ class LoadEliminationForLoop { public: LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, - DominatorTree *DT) - : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {} + DominatorTree *DT, BlockFrequencyInfo *BFI, + ProfileSummaryInfo* PSI) + : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {} /// Look through the loop-carried and loop-independent dependences in /// this loop and find store->load dependences. @@ -529,7 +533,11 @@ } if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { - if (L->getHeader()->getParent()->optForSize()) { + bool OptForSize = + L->getHeader()->getParent()->optForSize() || + (PSI && BFI && PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI)); + if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " "for size.\n"); @@ -572,6 +580,8 @@ LoopInfo *LI; const LoopAccessInfo &LAI; DominatorTree *DT; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; PredicatedScalarEvolution PSE; }; @@ -579,6 +589,7 @@ static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, function_ref GetLAI) { // Build up a worklist of inner-loops to transform to avoid iterator // invalidation. @@ -597,7 +608,7 @@ bool Changed = false; for (Loop *L : Worklist) { // The actual work is performed by LoadEliminationForLoop. - LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT); + LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI); Changed |= LEL.processLoop(); } return Changed; @@ -622,10 +633,14 @@ auto &LI = getAnalysis().getLoopInfo(); auto &LAA = getAnalysis(); auto &DT = getAnalysis().getDomTree(); + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; // Process each loop nest in the function. return eliminateLoadsAcrossLoops( - F, LI, DT, + F, LI, DT, BFI, PSI, [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); }); } @@ -638,6 +653,8 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); } }; @@ -653,6 +670,8 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) FunctionPass *llvm::createLoopLoadEliminationPass() { @@ -668,13 +687,17 @@ auto &TLI = AM.getResult(F); auto &AA = AM.getResult(F); auto &AC = AM.getResult(F); + auto &MAM = AM.getResult(F).getManager(); + auto *PSI = MAM.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult(F) : nullptr; MemorySSA *MSSA = EnableMSSALoopDependency ? &AM.getResult(F).getMSSA() : nullptr; auto &LAM = AM.getResult(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( - F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & { + F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult(L, AR); }); Index: lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -294,7 +294,8 @@ return LoopUnrollResult::Unmodified; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, OptLevel, None, None, None, None, None, None); + L, SE, TTI, nullptr, nullptr, OptLevel, + None, None, None, None, None, None); if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -23,7 +23,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -165,7 +167,8 @@ /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( - Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserAllowPeeling) { @@ -198,7 +201,12 @@ TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes - if (L->getHeader()->getParent()->optForSize()) { + bool OptForSize = + L->getHeader()->getParent()->optForSize() || + (PSI && BFI && + PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI)); + if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; } @@ -963,7 +971,9 @@ static LoopUnrollResult tryToUnrollLoop( Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, - OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, + OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, Optional ProvidedRuntime, Optional ProvidedUpperBound, @@ -989,7 +999,7 @@ bool NotDuplicatable; bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, + L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); // Exit early if unrolling is disabled. @@ -1170,7 +1180,8 @@ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced, + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, + PreserveLCSSA, OptLevel, OnlyWhenForced, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); @@ -1249,6 +1260,7 @@ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, /*Count*/ None, /*Threshold*/ None, /*AllowPartial*/ false, @@ -1351,6 +1363,8 @@ AM.getResult(F).getManager(); ProfileSummaryInfo *PSI = MAM.getCachedResult(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult(F) : nullptr; bool Changed = false; @@ -1386,7 +1400,7 @@ // The API here is quite complex to call and we allow to select some // flavors of unrolling during construction time (by setting UnrollOpts). LoopUnrollResult Result = tryToUnrollLoop( - &L, DT, &LI, SE, TTI, AC, ORE, + &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, Index: lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- lib/Transforms/Utils/SimplifyLibCalls.cpp +++ lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -16,8 +16,10 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" @@ -2327,7 +2329,11 @@ // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. - if (CI->getFunction()->optForSize()) + bool OptForSize = + CI->getFunction()->optForSize() || + (PSI && BFI && + PSI->shouldOptimizeForSize(CI->getFunction(), CI->getParent(), BFI)); + if (OptForSize) return nullptr; // Check if has any use @@ -2702,9 +2708,10 @@ LibCallSimplifier::LibCallSimplifier( const DataLayout &DL, const TargetLibraryInfo *TLI, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, function_ref Replacer, function_ref Eraser) - : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), + : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI), UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {} void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -88,6 +88,7 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -1458,12 +1459,13 @@ auto *LAA = &getAnalysis(); auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); + auto *PSI = &getAnalysis().getPSI(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA, *ORE); + GetLAA, *ORE, PSI); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1489,6 +1491,7 @@ AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); } }; @@ -6042,6 +6045,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -7115,7 +7119,8 @@ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) { + OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); @@ -7133,7 +7138,10 @@ // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->optForSize() || + (PSI && BFI && PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI))); // Plan how to best vectorize, return the best VF and its cost. VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); @@ -7214,7 +7222,10 @@ // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->optForSize() || + (PSI && BFI && PSI->shouldOptimizeForSize(L->getHeader()->getParent(), + L->getHeader(), BFI))); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7223,7 +7234,7 @@ // pipeline. if (!L->empty()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, Hints); + ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip @@ -7489,7 +7500,7 @@ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function &GetLAA_, - OptimizationRemarkEmitter &ORE_) { + OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; TTI = &TTI_; @@ -7501,6 +7512,7 @@ GetLAA = &GetLAA_; DB = &DB_; ORE = &ORE_; + PSI = PSI_; // Don't attempt if // 1. the target claims to have no vector registers, and @@ -7569,8 +7581,12 @@ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult(L, AR); }; + const ModuleAnalysisManager &MAM = + AM.getResult(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult(*F.getParent()); bool Changed = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; Index: test/Other/new-pm-defaults.ll =================================================================== --- test/Other/new-pm-defaults.ll +++ test/Other/new-pm-defaults.ll @@ -106,6 +106,7 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running analysis: AAManager +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. @@ -245,7 +246,6 @@ ; CHECK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis Index: test/Other/new-pm-lto-defaults.ll =================================================================== --- test/Other/new-pm-lto-defaults.ll +++ test/Other/new-pm-lto-defaults.ll @@ -69,6 +69,7 @@ ; CHECK-O2-NEXT: Starting llvm::Function pass manager run. ; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass ; CHECK-O2-NEXT: Running pass: InstCombinePass +; CHECK-O2-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass ; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass> Index: test/Other/new-pm-thinlto-defaults.ll =================================================================== --- test/Other/new-pm-thinlto-defaults.ll +++ test/Other/new-pm-thinlto-defaults.ll @@ -88,6 +88,7 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-PRELINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running analysis: AAManager +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA @@ -219,7 +220,6 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis Index: test/Other/opt-O2-pipeline.ll =================================================================== --- test/Other/opt-O2-pipeline.ll +++ test/Other/opt-O2-pipeline.ll @@ -214,6 +214,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results Index: test/Other/opt-O3-pipeline.ll =================================================================== --- test/Other/opt-O3-pipeline.ll +++ test/Other/opt-O3-pipeline.ll @@ -219,6 +219,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results Index: test/Other/opt-Os-pipeline.ll =================================================================== --- test/Other/opt-Os-pipeline.ll +++ test/Other/opt-Os-pipeline.ll @@ -201,6 +201,8 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results Index: test/Transforms/ConstantHoisting/ARM/const-addr-no-neg-offset.ll =================================================================== --- test/Transforms/ConstantHoisting/ARM/const-addr-no-neg-offset.ll +++ test/Transforms/ConstantHoisting/ARM/const-addr-no-neg-offset.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -S < %s | FileCheck %s +; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso -S < %s | FileCheck %s -check-prefix=PGSO ; There are different candidates here for the base constant: 1073876992 and ; 1073876996. But we don't want to see the latter because it results in @@ -40,3 +41,57 @@ } attributes #0 = { minsize norecurse nounwind optsize readnone uwtable } + +define void @foo_pgso() #1 !prof !14 { +entry: +; PGSO-LABEL: @foo_pgso +; PGSO-NOT: [[CONST1:%const_mat[0-9]*]] = add i32 %const, -4 + %0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096 + %or = or i32 %0, 1 + store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096 + %1 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4 + %and = and i32 %1, -117506048 + store volatile i32 %and, i32* inttoptr (i32 1073876996 to i32*), align 4 + %2 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096 + %and1 = and i32 %2, -17367041 + store volatile i32 %and1, i32* inttoptr (i32 1073876996 to i32*), align 4096 + %3 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096 + %and2 = and i32 %3, -262145 + store volatile i32 %and2, i32* inttoptr (i32 1073876992 to i32*), align 4096 + %4 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4 + %and3 = and i32 %4, -8323073 + store volatile i32 %and3, i32* inttoptr (i32 1073876996 to i32*), align 4 + store volatile i32 10420224, i32* inttoptr (i32 1073877000 to i32*), align 8 + %5 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4096 + %or4 = or i32 %5, 65536 + store volatile i32 %or4, i32* inttoptr (i32 1073876996 to i32*), align 4096 + %6 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192 + %or6.i.i = or i32 %6, 16 + store volatile i32 %or6.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192 + %7 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192 + %and7.i.i = and i32 %7, -4 + store volatile i32 %and7.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192 + %8 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192 + %or8.i.i = or i32 %8, 2 + store volatile i32 %or8.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192 + ret void +} + +attributes #1 = { norecurse nounwind readnone uwtable } ; no optsize or minsize + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} Index: test/Transforms/InstCombine/fputs-opt-size.ll =================================================================== --- test/Transforms/InstCombine/fputs-opt-size.ll +++ test/Transforms/InstCombine/fputs-opt-size.ll @@ -2,6 +2,7 @@ ; because it requires more arguments and thus extra MOVs are required. ; ; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instcombine -pgso -S | FileCheck %s -check-prefix=PGSO %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } @@ -26,3 +27,31 @@ attributes #0 = { nounwind optsize } attributes #1 = { nounwind optsize } + +define i32 @main_pgso() local_unnamed_addr !prof !14 { +entry: +; PGSO-LABEL: @main_pgso( +; PGSO-NOT: call i64 @fwrite +; PGSO: call i32 @fputs + + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0)) #2 + %call1 = tail call i32 @fputs(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.2, i32 0, i32 0), %struct._IO_FILE* %call) #2 + ret i32 0 +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} Index: test/Transforms/LoopLoadElim/opt-size.ll =================================================================== --- test/Transforms/LoopLoadElim/opt-size.ll +++ test/Transforms/LoopLoadElim/opt-size.ll @@ -1,4 +1,5 @@ ; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -basicaa -loop-load-elim -pgso -S < %s | FileCheck %s -check-prefix=PGSO ; When optimizing for size don't eliminate in this loop because the loop would ; have to be versioned first because A and C may alias. @@ -74,3 +75,83 @@ for.end: ; preds = %for.body ret void } + + +; PGSO-LABEL: @f_pgso( +define void @f_pgso(i32* %A, i32* %B, i32* %C, i64 %N) !prof !14 { + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + + %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next + %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + + %b = load i32, i32* %Bidx, align 4 + %a_p1 = add i32 %b, 2 + store i32 %a_p1, i32* %Aidx_next, align 4 + + %a = load i32, i32* %Aidx, align 4 +; PGSO: %c = mul i32 %a, 2 + %c = mul i32 %a, 2 + store i32 %c, i32* %Cidx, align 4 + + %exitcond = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; PGSO-LABEL: @g_pgso( +define void @g_pgso(i32* noalias %A, i32* %B, i32* noalias %C, i64 %N) !prof !14 { + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + + %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next + %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + + %b = load i32, i32* %Bidx, align 4 + %a_p1 = add i32 %b, 2 + store i32 %a_p1, i32* %Aidx_next, align 4 + + %a = load i32, i32* %Aidx, align 4 +; PGSO: %c = mul i32 %store_forwarded, 2 + %c = mul i32 %a, 2 + store i32 %c, i32* %Cidx, align 4 + + %exitcond = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} Index: test/Transforms/LoopUnroll/unroll-opt-attribute.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-opt-attribute.ll +++ test/Transforms/LoopUnroll/unroll-opt-attribute.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -S -loop-unroll -unroll-count=4 | FileCheck -check-prefix=CHECK_COUNT4 %s ; RUN: opt < %s -S -loop-unroll | FileCheck -check-prefix=CHECK_NOCOUNT %s +; RUN: opt < %s -S -passes='require,function(unroll)' -pgso | FileCheck -check-prefix=PGSO %s ;///////////////////// TEST 1 ////////////////////////////// @@ -128,3 +129,44 @@ ; CHECK_NOCOUNT-LABEL: @Test4 ; CHECK_NOCOUNT: phi ; CHECK_NOCOUNT: icmp + +;///////////////////// TEST 5 ////////////////////////////// + +; This test shows that with PGO, this loop is cold and not unrolled. + +define i32 @Test5() !prof !14 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [24 x i32], [24 x i32]* @tab, i32 0, i32 %i.05 + store i32 %i.05, i32* %arrayidx, align 4 + %inc = add nuw nsw i32 %i.05, 1 + %exitcond = icmp eq i32 %inc, 24 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 42 +} + +; PGSO-LABEL: @Test5 +; PGSO: phi +; PGSO: icmp + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} Index: test/Transforms/LoopVectorize/optsize.ll =================================================================== --- test/Transforms/LoopVectorize/optsize.ll +++ test/Transforms/LoopVectorize/optsize.ll @@ -2,6 +2,7 @@ ; loop with the optimize for size or the minimize size attributes. ; REQUIRES: asserts ; RUN: opt < %s -loop-vectorize -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" @@ -57,3 +58,42 @@ attributes #1 = { minsize } +define i32 @foo_pgso() !prof !14 { +; PGSO-LABEL: @foo_pgso( +; PGSO-NOT: <2 x i8> +; PGSO-NOT: <4 x i8> + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0}