diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -48,6 +48,7 @@
 class GlobalValue;
 class GlobalVariable;
 class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
 class MachineConstantPoolValue;
 class MachineDominatorTree;
 class MachineFunction;
@@ -69,6 +70,7 @@
 class MCTargetOptions;
 class MDNode;
 class Module;
+class ProfileSummaryInfo;
 class raw_ostream;
 class RemarkStreamer;
 class StackMaps;
@@ -108,6 +110,10 @@
   /// Optimization remark emitter.
   MachineOptimizationRemarkEmitter *ORE;
 
+  MachineBlockFrequencyInfo *MBFI;
+
+  ProfileSummaryInfo *PSI;
+
   /// The symbol for the current function. This is recalculated at the beginning
   /// of each call to runOnMachineFunction().
   MCSymbol *CurrentFnSym = nullptr;
diff --git a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
--- a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
+++ b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
@@ -182,6 +182,10 @@
     }
   }
 
+  MachineBlockFrequencyInfo *getBFI() {
+    return MBFI;
+  }
+
 private:
   MachineFunction &MF;
diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h
--- a/llvm/include/llvm/CodeGen/TailDuplicator.h
+++ b/llvm/include/llvm/CodeGen/TailDuplicator.h
@@ -25,11 +25,13 @@
 namespace llvm {
 
 class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
 class MachineBranchProbabilityInfo;
 class MachineFunction;
 class MachineInstr;
 class MachineModuleInfo;
 class MachineRegisterInfo;
+class ProfileSummaryInfo;
 class TargetRegisterInfo;
 
 /// Utility class to perform tail duplication.
@@ -40,6 +42,8 @@
   const MachineModuleInfo *MMI;
   MachineRegisterInfo *MRI;
   MachineFunction *MF;
+  const MachineBlockFrequencyInfo *MBFI;
+  ProfileSummaryInfo *PSI;
   bool PreRegAlloc;
   bool LayoutMode;
   unsigned TailDupSize;
@@ -65,6 +69,8 @@
   /// default implies using the command line value TailDupSize.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, + const MachineBlockFrequencyInfo *MBFI, + ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize = 0); bool tailDuplicateBlocks(); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -31,13 +31,16 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -52,6 +55,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -248,6 +252,8 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); } bool AsmPrinter::doInitialization(Module &M) { @@ -1684,6 +1690,13 @@ } ORE = &getAnalysis().getORE(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + // ORE conditionally computes MBFI. If available, use it, otherwise + // request it. + (ORE->getBFI() ? ORE->getBFI() : + &getAnalysis().getBFI()) : + nullptr; } namespace { @@ -2913,8 +2926,10 @@ void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB, MCCodePaddingContext &Context) const { assert(MF != nullptr && "Machine function must be valid"); + bool OptForSize = MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); Context.IsPaddingActive = !MF->hasInlineAsm() && - !MF->getFunction().hasOptSize() && + !OptForSize && TM.getOptLevel() != CodeGenOpt::None; Context.IsBasicBlockReachableViaFallthrough = std::find(MBB.pred_begin(), MBB.pred_end(), MBB.getPrevNode()) != diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h --- a/llvm/lib/CodeGen/BranchFolding.h +++ b/llvm/lib/CodeGen/BranchFolding.h @@ -27,6 +27,7 @@ class MachineLoopInfo; class MachineModuleInfo; class MachineRegisterInfo; +class ProfileSummaryInfo; class raw_ostream; class TargetInstrInfo; class TargetRegisterInfo; @@ -39,6 +40,7 @@ bool CommonHoist, MBFIWrapper &FreqInfo, const MachineBranchProbabilityInfo &ProbInfo, + ProfileSummaryInfo *PSI, // Min tail length to merge. Defaults to commandline // flag. Ignored for optsize. 
unsigned MinTailLength = 0); @@ -145,6 +147,7 @@ const BlockFrequency Freq) const; void view(const Twine &Name, bool isSimple = true); uint64_t getEntryFreq() const; + const MachineBlockFrequencyInfo &getMBFI() { return MBFI; } private: const MachineBlockFrequencyInfo &MBFI; @@ -154,6 +157,7 @@ private: MBFIWrapper &MBBFreqInfo; const MachineBranchProbabilityInfo &MBPI; + ProfileSummaryInfo *PSI; bool TailMergeBlocks(MachineFunction &MF); bool TryTailMergeBlocks(MachineBasicBlock* SuccBB, diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -38,6 +39,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -103,6 +105,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -129,7 +132,8 @@ BranchFolder::MBFIWrapper MBBFreqInfo( getAnalysis()); BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo, - getAnalysis()); + getAnalysis(), + &getAnalysis().getPSI()); auto *MMIWP = getAnalysisIfAvailable(); return Folder.OptimizeFunction( MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(), @@ -139,9 +143,10 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, MBFIWrapper &FreqInfo, const MachineBranchProbabilityInfo &ProbInfo, + ProfileSummaryInfo *PSI, unsigned MinTailLength) : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength), - MBBFreqInfo(FreqInfo), MBPI(ProbInfo) { + MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) { if (MinCommonTailLength == 0) MinCommonTailLength = TailMergeSize; switch (FlagEnableTailMerge) { @@ -585,7 +590,9 @@ MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB, DenseMap &EHScopeMembership, - bool AfterPlacement) { + bool AfterPlacement, + BranchFolder::MBFIWrapper &MBBFreqInfo, + ProfileSummaryInfo *PSI) { // It is never profitable to tail-merge blocks from two different EH scopes. if (!EHScopeMembership.empty()) { auto EHScope1 = EHScopeMembership.find(MBB1); @@ -682,7 +689,11 @@ // branch instruction, which is likely to be smaller than the 2 // instructions that would be deleted in the merge. 
MachineFunction *MF = MBB1->getParent(); - return EffectiveTailLen >= 2 && MF->getFunction().hasOptSize() && + bool OptForSize = + MF->getFunction().hasOptSize() || + (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) && + llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI())); + return EffectiveTailLen >= 2 && OptForSize && (FullBlockTail1 || FullBlockTail2); } @@ -704,7 +715,7 @@ CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, EHScopeMembership, - AfterBlockPlacement)) { + AfterBlockPlacement, MBBFreqInfo, PSI)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; @@ -1534,8 +1545,10 @@ } } - if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && - MF.getFunction().hasOptSize()) { + bool OptForSize = + MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo.getMBFI()); + if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) { // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch // direction, thereby defeating careful block placement and regressing // performance. Therefore, only consider this for optsize functions. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -90,6 +90,7 @@ #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include #include #include @@ -256,6 +257,7 @@ const LoopInfo *LI; std::unique_ptr BFI; std::unique_ptr BPI; + ProfileSummaryInfo *PSI; /// As we scan instructions optimizing them, this is the next instruction /// to optimize. Transforms that can invalidate this should update it. @@ -298,7 +300,7 @@ /// Keep track of SExt promoted. ValueToSExts ValToSExtendedUses; - /// True if optimizing for size. + /// True if the function has the OptSize attribute. bool OptSize; /// DataLayout for the Function being processed. @@ -435,10 +437,8 @@ LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); + PSI = &getAnalysis().getPSI(); OptSize = F.hasOptSize(); - - ProfileSummaryInfo *PSI = - &getAnalysis().getPSI(); if (ProfileGuidedSectionPrefix) { if (PSI->isFunctionHotInCallGraph(&F, *BFI)) F.setSectionPrefix(".hot"); @@ -457,7 +457,9 @@ // bypassSlowDivision may create new BBs, but we don't want to reapply the // optimization to those blocks. BasicBlock* Next = BB->getNextNode(); - EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + // F.hasOptSize is already checked in the outer if statement. + if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); BB = Next; } } @@ -1938,7 +1940,8 @@ // cold block. This interacts with our handling for loads and stores to // ensure that we can fold all uses of a potential addressing computation // into their uses. TODO: generalize this to work over profiling data - if (!OptSize && CI->hasFnAttr(Attribute::Cold)) + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get()); + if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) for (auto &Arg : CI->arg_operands()) { if (!Arg->getType()->isPointerTy()) continue; @@ -2875,16 +2878,24 @@ /// When true, IsProfitableToFoldIntoAddressingMode always returns true. bool IgnoreProfitability; + /// True if we are optimizing for size. 
+ bool OptSize; + + ProfileSummaryInfo *PSI; + BlockFrequencyInfo *BFI; + AddressingModeMatcher( SmallVectorImpl &AMI, const TargetLowering &TLI, const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, - std::pair, int64_t> &LargeOffsetGEP) + std::pair, int64_t> &LargeOffsetGEP, + bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), - PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) { + PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP), + OptSize(OptSize), PSI(PSI), BFI(BFI) { IgnoreProfitability = false; } @@ -2902,12 +2913,14 @@ const TargetLowering &TLI, const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, - std::pair, int64_t> &LargeOffsetGEP) { + std::pair, int64_t> &LargeOffsetGEP, + bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { ExtAddrMode Result; bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS, MemoryInst, Result, InsertedInsts, - PromotedInsts, TPT, LargeOffsetGEP) + PromotedInsts, TPT, LargeOffsetGEP, + OptSize, PSI, BFI) .matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; @@ -4518,7 +4531,8 @@ Instruction *I, SmallVectorImpl> &MemoryUses, SmallPtrSetImpl &ConsideredInsts, const TargetLowering &TLI, - const TargetRegisterInfo &TRI, int SeenInsts = 0) { + const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, int SeenInsts = 0) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -4527,8 +4541,6 @@ if (!MightBeFoldableInst(I)) return true; - const bool OptSize = I->getFunction()->hasOptSize(); - // Loop over all the uses, recursively processing them. for (Use &U : I->uses()) { // Conservatively return true if we're seeing a large number or a deep chain @@ -4569,7 +4581,9 @@ if (CallInst *CI = dyn_cast(UserI)) { // If this is a cold call, we can sink the addressing calculation into // the cold path. See optimizeCallInst - if (!OptSize && CI->hasFnAttr(Attribute::Cold)) + bool OptForSize = OptSize || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) continue; InlineAsm *IA = dyn_cast(CI->getCalledValue()); @@ -4581,8 +4595,8 @@ continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, - SeenInsts)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI, SeenInsts)) return true; } @@ -4670,7 +4684,8 @@ // the use is just a particularly nice way of sinking it. SmallVector, 16> MemoryUses; SmallPtrSet ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI)) return false; // Has a non-memory, non-foldable use! 
// Now that we know that all uses of this instruction are part of a chain of @@ -4706,7 +4721,7 @@ TPT.getRestorationPoint(); AddressingModeMatcher Matcher( MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result, - InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); + InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); Matcher.IgnoreProfitability = true; bool Success = Matcher.matchAddr(Address, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -4812,7 +4827,8 @@ 0); ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, - InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); + InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, + BFI.get()); GetElementPtrInst *GEP = LargeOffsetGEP.first; if (GEP && !NewGEPBases.count(GEP)) { @@ -6030,7 +6046,9 @@ /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // If branch conversion isn't desirable, exit early. - if (DisableSelectToBranch || OptSize || !TLI) + if (DisableSelectToBranch || + OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) || + !TLI) return false; // Find all consecutive select instructions that share the same condition. diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -13,6 +13,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -21,6 +23,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/SizeOpts.h" using namespace llvm; @@ -721,7 +724,8 @@ /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] /// ret i32 %phi.res static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, - const TargetLowering *TLI, const DataLayout *DL) { + const TargetLowering *TLI, const DataLayout *DL, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { NumMemCmpCalls++; // Early exit from expansion if -Oz. @@ -742,18 +746,20 @@ // TTI call to check if target would like to expand memcmp. Also, get the // available load sizes. 
const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(), + bool OptForSize = CI->getFunction()->hasOptSize() || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp); if (!Options) return false; if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()) Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock; - if (CI->getFunction()->hasOptSize() && + if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize; - if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences()) + if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmp; MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL); @@ -799,7 +805,11 @@ &getAnalysis().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - auto PA = runImpl(F, TLI, TTI, TL); + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; + auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI); return !PA.areAllPreserved(); } @@ -807,22 +817,26 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); FunctionPass::getAnalysisUsage(AU); } PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL); + const TargetLowering* TL, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); // Returns true if a change was made. bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL); + const DataLayout& DL, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); }; bool ExpandMemCmpPass::runOnBlock( BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL) { + const DataLayout& DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { for (Instruction& I : BB) { CallInst *CI = dyn_cast(&I); if (!CI) { @@ -831,7 +845,7 @@ LibFunc Func; if (TLI->getLibFunc(ImmutableCallSite(CI), Func) && (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && - expandMemCmp(CI, TTI, TL, &DL)) { + expandMemCmp(CI, TTI, TL, &DL, PSI, BFI)) { return true; } } @@ -841,11 +855,12 @@ PreservedAnalyses ExpandMemCmpPass::runImpl( Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL) { + const TargetLowering* TL, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { const DataLayout& DL = F.getParent()->getDataLayout(); bool MadeChanges = false; for (auto BBIt = F.begin(); BBIt != F.end();) { - if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) { + if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI)) { MadeChanges = true; // If changes were made, restart the function from the beginning, since // the structure of the function was changed. 
@@ -864,6 +879,8 @@ "Expand memcmp() to load/stores", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", "Expand memcmp() to load/stores", false, false) diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -213,6 +214,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -434,6 +436,7 @@ INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { @@ -446,6 +449,8 @@ TRI = ST.getRegisterInfo(); BranchFolder::MBFIWrapper MBFI(getAnalysis()); MBPI = &getAnalysis(); + ProfileSummaryInfo *PSI = + &getAnalysis().getPSI(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); @@ -456,7 +461,7 @@ bool BFChange = false; if (!PreRegAlloc) { // Tail merge tend to expose more if-conversion opportunities. - BranchFolder BF(true, false, MBFI, *MBPI); + BranchFolder BF(true, false, MBFI, *MBPI, PSI); auto *MMIWP = getAnalysisIfAvailable(); BFChange = BF.OptimizeFunction( MF, TII, ST.getRegisterInfo(), @@ -598,7 +603,7 @@ BBAnalysis.clear(); if (MadeChange && IfCvtBranchFold) { - BranchFolder BF(false, false, MBFI, *MBPI); + BranchFolder BF(false, false, MBFI, *MBPI, PSI); auto *MMIWP = getAnalysisIfAvailable(); BF.OptimizeFunction( MF, TII, MF.getSubtarget().getRegisterInfo(), diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -41,6 +42,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -363,6 +365,8 @@ /// A handle to the post dominator tree. MachinePostDominatorTree *MPDT; + ProfileSummaryInfo *PSI; + /// Duplicator used to duplicate tails during placement. 
/// /// Placement decisions can open up new tail duplication opportunities, but @@ -538,6 +542,7 @@ if (TailDupPlacement) AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -555,6 +560,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) @@ -2075,7 +2081,10 @@ // i.e. when the layout predecessor does not fallthrough to the loop header. // In practice this never happens though: there always seems to be a preheader // that can fallthrough and that is also placed before the header. - if (F->getFunction().hasOptSize()) + bool OptForSize = F->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(L.getHeader(), PSI, + &MBFI->getMBFI()); + if (OptForSize) return L.getHeader(); MachineBasicBlock *OldTop = nullptr; @@ -2831,6 +2840,11 @@ if (Freq < (LoopHeaderFreq * ColdProb)) continue; + // If the global profiles indicates so, don't align it. + if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) && + !TLI->alignLoopsWithOptSize()) + continue; + // Check for the existence of a non-layout predecessor which would benefit // from aligning this block. MachineBasicBlock *LayoutPred = @@ -3038,6 +3052,7 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MPDT = nullptr; + PSI = &getAnalysis().getPSI(); // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. @@ -3068,10 +3083,13 @@ if (allowTailDupPlacement()) { MPDT = &getAnalysis(); - if (MF.getFunction().hasOptSize()) + bool OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); + if (OptForSize) TailDupSize = 1; bool PreRegAlloc = false; - TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize); + TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI, + /* LayoutMode */ true, TailDupSize); precomputeTriangleChains(); } @@ -3087,7 +3105,7 @@ if (MF.size() > 3 && EnableTailMerge) { unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, - *MBPI, TailMergeSize); + *MBPI, PSI, TailMergeSize); auto *MMIWP = getAnalysisIfAvailable(); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -12,11 +12,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -67,6 +70,8 @@ MachineLoopInfo *MLI; // Current MachineLoopInfo MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; + MachineBlockFrequencyInfo *MBFI; + ProfileSummaryInfo *PSI; TargetSchedModel TSchedModel; @@ -83,7 +88,7 @@ StringRef 
getPassName() const override { return "Machine InstCombiner"; } private: - bool doSubstitute(unsigned NewSize, unsigned OldSize); + bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize); bool combineInstructions(MachineBasicBlock *); MachineInstr *getOperandDef(const MachineOperand &MO); unsigned getDepth(SmallVectorImpl &InsInstrs, @@ -132,6 +137,8 @@ AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -409,8 +416,9 @@ /// \returns true when new instruction sequence should be generated /// independent if it lengthens critical path or not -bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { - if (OptSize && (NewSize < OldSize)) +bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize, + bool OptForSize) { + if (OptForSize && (NewSize < OldSize)) return true; if (!TSchedModel.hasInstrSchedModelOrItineraries()) return true; @@ -508,6 +516,8 @@ SparseSet RegUnits; RegUnits.setUniverse(TRI->getNumRegUnits()); + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI); + while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; SmallVector Patterns; @@ -584,7 +594,8 @@ // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { + if (SubstituteAlways || + doSubstitute(NewInstCount, OldInstCount, OptForSize)) { insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, RegUnits, IncrementalUpdate); // Eagerly stop after the first pattern fires. @@ -639,6 +650,10 @@ MRI = &MF.getRegInfo(); MLI = &getAnalysis(); Traces = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; MinInstr = nullptr; OptSize = MF.getFunction().hasOptSize(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -27,8 +27,10 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/FastISel.h" @@ -334,6 +336,8 @@ AU.addRequired(); if (UseMBPI && OptLevel != CodeGenOpt::None) AU.addRequired(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -436,14 +440,17 @@ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable(); LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? 
+              &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+              nullptr;
 
   LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
 
   CurDAG->init(*MF, *ORE, this, LibInfo,
-               getAnalysisIfAvailable<LegacyDivergenceAnalysis>(),
-               nullptr, nullptr);
+               getAnalysisIfAvailable<LegacyDivergenceAnalysis>(), PSI, BFI);
   FuncInfo->set(Fn, *MF, CurDAG);
   SwiftError->setFunction(*MF);
diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp
--- a/llvm/lib/CodeGen/TailDuplication.cpp
+++ b/llvm/lib/CodeGen/TailDuplication.cpp
@@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -38,6 +40,8 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -75,7 +79,11 @@
     return false;
 
   auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
-  Duplicator.initMF(MF, PreRegAlloc, MBPI, /*LayoutMode=*/false);
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
+  Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI, PSI, /*LayoutMode=*/false);
 
   bool MadeChange = false;
   while (Duplicator.tailDuplicateBlocks())
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -19,13 +19,16 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -77,6 +80,8 @@
 
 void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
                             const MachineBranchProbabilityInfo *MBPIin,
+                            const MachineBlockFrequencyInfo *MBFIin,
+                            ProfileSummaryInfo *PSIin,
                             bool LayoutModeIn, unsigned TailDupSizeIn) {
   MF = &MFin;
   TII = MF->getSubtarget().getInstrInfo();
@@ -84,6 +89,8 @@
   MRI = &MF->getRegInfo();
   MMI = &MF->getMMI();
   MBPI = MBPIin;
+  MBFI = MBFIin;
+  PSI = PSIin;
   TailDupSize = TailDupSizeIn;
 
   assert(MBPI != nullptr && "Machine Branch Probability Info required");
@@ -555,14 +562,14 @@
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
-  if (TailDupSize == 0 &&
-      TailDuplicateSize.getNumOccurrences() == 0 &&
-      MF->getFunction().hasOptSize())
-    MaxDuplicateCount = 1;
-  else if (TailDupSize == 0)
+  bool OptForSize = MF->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI);
+  if (TailDupSize == 0)
     MaxDuplicateCount = TailDuplicateSize;
   else
     MaxDuplicateCount = TailDupSize;
+  if (OptForSize)
+    MaxDuplicateCount = 1;
 
   // If the block to be duplicated ends in an unanalyzable fallthrough, don't
   // duplicate it.
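
The rewritten MaxDuplicateCount logic in TailDuplicator above keeps the explicit knobs in charge of the base limit and then clamps duplication to a single copy for size-optimized blocks. The sketch below only illustrates that decision and is not code from the patch; maxTailDupCount and DefaultTailDupSize are hypothetical names standing in for the pass state and the TailDuplicateSize command-line default.

```cpp
// Illustrative sketch (not part of the patch): mirrors the rewritten
// MaxDuplicateCount computation in TailDuplicator.cpp above.
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/IR/Function.h"

static unsigned maxTailDupCount(const llvm::MachineBasicBlock &TailBB,
                                llvm::ProfileSummaryInfo *PSI,
                                const llvm::MachineBlockFrequencyInfo *MBFI,
                                unsigned TailDupSize,         // pass-provided limit
                                unsigned DefaultTailDupSize)  // cl::opt default
{
  const llvm::MachineFunction *MF = TailBB.getParent();
  // The explicit knobs pick the base limit: a non-zero pass-provided size
  // wins, otherwise the command-line default applies.
  unsigned MaxDuplicateCount = TailDupSize ? TailDupSize : DefaultTailDupSize;
  // hasOptSize() preserves the old -Os behaviour; shouldOptimizeForSize()
  // additionally lets profile information mark individual blocks for size
  // optimization, clamping duplication to a single copy.
  if (MF->getFunction().hasOptSize() ||
      llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI))
    MaxDuplicateCount = 1;
  return MaxDuplicateCount;
}
```

Note that, as the hunk above shows, the clamp now applies even when -tail-dup-size is set explicitly, whereas the removed code skipped the optsize clamp whenever the flag had been given on the command line.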
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -48,11 +48,14 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
@@ -113,6 +116,8 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
                                        // guide some heuristics.
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -140,6 +145,9 @@
   /// Register Liveness information after the current instruction.
   LivePhysRegs LiveRegs;
+
+  ProfileSummaryInfo *PSI;
+  MachineBlockFrequencyInfo *MBFI;
 };
 char FixupBWInstPass::ID = 0;
 }
@@ -154,8 +162,11 @@
   this->MF = &MF;
   TII = MF.getSubtarget().getInstrInfo();
-  OptForSize = MF.getFunction().hasOptSize();
   MLI = &getAnalysis<MachineLoopInfo>();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
   LiveRegs.init(TII->getRegisterInfo());
 
   LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
@@ -426,6 +437,9 @@
   // We run after PEI, so we need to AddPristinesAndCSRs.
   LiveRegs.addLiveOuts(MBB);
 
+  OptForSize = MF.getFunction().hasOptSize() ||
+               llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+
   for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
     MachineInstr *MI = &*I;
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -25,6 +25,8 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,6 +34,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -247,6 +250,12 @@
   static char ID;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
@@ -681,6 +690,11 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getSubtarget().getRegisterInfo();
+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
 
   // Process all basic blocks.
   for (auto &MBB : MF) {
@@ -699,7 +713,9 @@
     // Remove redundant address calculations. Do it only for -Os/-Oz since only
     // a code size gain is expected from this part of the pass.
-    if (MF.getFunction().hasOptSize())
+    bool OptForSize = MF.getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+    if (OptForSize)
       Changed |= removeRedundantAddrCalc(LEAs);
   }
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -17,8 +17,11 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Function.h"
@@ -52,6 +55,12 @@
     bool runOnMachineFunction(MachineFunction &MF) override;
 
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
+      AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
     MachineFunctionProperties getRequiredProperties() const override {
       return MachineFunctionProperties().set(
           MachineFunctionProperties::Property::NoVRegs);
     }
@@ -105,6 +114,12 @@
   TSM.init(&MF.getSubtarget());
 
+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
+
   // Search through basic blocks and mark the ones that have early returns
   ReturnBBs.clear();
   VisitedBBs.clear();
@@ -118,6 +133,11 @@
     MachineBasicBlock *MBB = I->first;
     unsigned Cycles = I->second;
 
+    // Function::hasOptSize is already checked above.
+    bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+    if (OptForSize)
+      continue;
+
     if (Cycles < Threshold) {
       // BB ends in a return. Skip over any DBG_VALUE instructions
       // trailing the terminator.
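
Every machine pass converted above (AsmPrinter, MachineCombiner, X86FixupBWInsts, X86OptimizeLEAs, X86PadShortFunction) repeats the same wiring: require ProfileSummaryInfo, materialize the lazily computed MachineBlockFrequencyInfo only when a profile summary is actually present, and fold the per-block query into the existing hasOptSize() check. The skeleton below condenses that pattern under an invented pass name (SizeOptsExampleSketch); it is a sketch of the pattern, not code taken from the patch, and omits pass registration boilerplate.

```cpp
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/IR/Function.h"

using namespace llvm;

namespace {
// Hypothetical pass illustrating the PSI/MBFI wiring used throughout the patch.
struct SizeOptsExampleSketch : public MachineFunctionPass {
  static char ID;
  SizeOptsExampleSketch() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    ProfileSummaryInfo *PSI =
        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    // Block frequencies are only computed when a profile summary exists;
    // without one, shouldOptimizeForSize() simply returns false and the pass
    // keeps its pre-patch, attribute-only behaviour.
    MachineBlockFrequencyInfo *MBFI =
        (PSI && PSI->hasProfileSummary())
            ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
            : nullptr;

    bool Changed = false;
    for (MachineBasicBlock &MBB : MF) {
      bool OptForSize = MF.getFunction().hasOptSize() ||
                        shouldOptimizeForSize(&MBB, PSI, MBFI);
      if (OptForSize) {
        // ... apply the size-oriented variant of the transformation here ...
      }
    }
    return Changed;
  }
};
} // end anonymous namespace

char SizeOptsExampleSketch::ID = 0;
```

The extra "Profile summary info" and "Lazy Machine Block Frequency Analysis" entries in the pipeline tests that follow are this wiring becoming visible in the pass manager output.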
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -11,6 +11,7 @@ ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Create Garbage Collector Module Metadata +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -45,6 +46,10 @@ ; CHECK-NEXT: Analysis for ComputingKnownBits ; CHECK-NEXT: InstructionSelect ; CHECK-NEXT: ResetMachineFunction +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis -; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Profile summary info +; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -35,6 +35,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -78,10 +81,13 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -93,6 +99,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Machine Trace Metrics ; CHECK-NEXT: AArch64 Conditional Compares +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine InstCombiner ; CHECK-NEXT: AArch64 Conditional Branch Tuning ; CHECK-NEXT: Machine Trace Metrics @@ -149,6 +156,7 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -19,6 +19,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: 
Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -67,8 +70,11 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: ARM Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -124,6 +130,7 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -14,6 +14,7 @@ ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Create Garbage Collector Module Metadata +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -37,6 +38,10 @@ ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: X86 DAG->DAG Instruction Selection ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -13,8 +13,8 @@ ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Profile summary info +; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -32,6 +32,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -64,12 +67,15 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: X86 DAG->DAG Instruction Selection ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Local 
Dynamic TLS Access Clean-up ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: X86 Domain Reassignment Pass +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -80,6 +86,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Machine Trace Metrics ; CHECK-NEXT: Early If-Conversion +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine InstCombiner ; CHECK-NEXT: X86 cmov Conversion ; CHECK-NEXT: MachineDominator Tree Construction @@ -94,6 +101,7 @@ ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: Live Range Shrink ; CHECK-NEXT: X86 Fixup SetCC +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 LEA Optimize ; CHECK-NEXT: X86 Optimize Call Frame ; CHECK-NEXT: X86 Avoid Store Forwarding Block @@ -139,6 +147,7 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass @@ -157,7 +166,9 @@ ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Byte/Word Instruction Fixup +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
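
The pipeline-test churn above only records the new analysis scheduling in the -O0/-O3 pass pipelines. For the IR-level passes earlier in the patch (CodeGenPrepare, ExpandMemCmp, SelectionDAGISel) the same decision is made with the BasicBlock flavour of the query from llvm/Transforms/Utils/SizeOpts.h. The helper below is a hypothetical sketch of that combined check, not an API added by the patch.

```cpp
// Hypothetical helper (not in the patch): IR-level counterpart of the
// per-block size query used by CodeGenPrepare and ExpandMemCmp above.
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

static bool blockPrefersSizeOpts(const llvm::BasicBlock *BB,
                                 llvm::ProfileSummaryInfo *PSI,
                                 llvm::BlockFrequencyInfo *BFI) {
  // The function-wide optsize attribute keeps its old meaning; the
  // profile-based query only adds blocks the profile shows to be cold.
  // Passing a null BFI is allowed and disables the profile-based part.
  return BB->getParent()->hasOptSize() ||
         llvm::shouldOptimizeForSize(BB, PSI, BFI);
}
```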