Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -52,6 +52,8 @@ // because the number of profile counts required to reach the hot // percentile is above a huge threshold. Optional<bool> HasHugeWorkingSetSize; + Optional<uint64_t> PgsoHotCountThreshold; + Optional<bool> PgsoHasHugeWorkingSetSize; public: ProfileSummaryInfo(Module &M) : M(M) {} @@ -96,6 +98,9 @@ bool AllowSynthetic = false); /// Returns true if the working set size of the code is considered huge. bool hasHugeWorkingSetSize(); + /// Returns true if the working set size of the code is considered huge for + /// PGSO. + bool pgsoHasHugeWorkingSetSize(); /// Returns true if \p F has hot function entry. bool isFunctionEntryHot(const Function *F); /// Returns true if \p F contains hot code. @@ -104,14 +109,20 @@ bool isFunctionEntryCold(const Function *F); /// Returns true if \p F contains only cold code. bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI); + /// Returns true if \p F contains hot code for PGSO. + bool isFunctionPgsoHotInCallGraph(const Function *F, BlockFrequencyInfo &BFI); /// Returns true if count \p C is considered hot. bool isHotCount(uint64_t C); /// Returns true if count \p C is considered cold. bool isColdCount(uint64_t C); + /// Returns true if count \p C is considered hot for PGSO. + bool isPgsoHotCount(uint64_t C); /// Returns true if BasicBlock \p BB is considered hot. bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if BasicBlock \p BB is considered cold. bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); + /// Returns true if BasicBlock \p BB is considered hot for PGSO. + bool isPgsoHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI); /// Returns true if CallSite \p CS is considered hot. bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); /// Returns true if CallSite \p CS is considered cold. Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -40,12 +40,14 @@ } class AssumptionCache; +class BlockFrequencyInfo; class BranchInst; class Function; class GlobalValue; class IntrinsicInst; class LoadInst; class Loop; +class ProfileSummaryInfo; class SCEV; class ScalarEvolution; class StoreInst; @@ -297,7 +299,9 @@ /// \p JTSize Set a jump table size only when \p SI is suitable for a jump /// table. unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) const; + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) const; /// Estimate the cost of a given IR user when lowered.
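For callers of the widened TTI hook, the two new parameters are optional in practice: a pass that has profile data threads its ProfileSummaryInfo and BlockFrequencyInfo through, and one that does not simply passes nullptr (as the IRTranslator change later in this patch does). A minimal sketch of a hypothetical caller, modeled on the InlineCost.cpp change below; the helper name is illustrative only:

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/ProfileSummaryInfo.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"

  // Estimate how many clusters a switch will lower to, letting TTI consult
  // the profile (via PSI/BFI) when deciding jump-table suitability.
  static unsigned estimateSwitchClusters(const llvm::TargetTransformInfo &TTI,
                                         const llvm::SwitchInst &SI,
                                         llvm::ProfileSummaryInfo *PSI,
                                         llvm::BlockFrequencyInfo *BFI) {
    unsigned JumpTableSize = 0;
    // PSI and BFI may be null; the default implementation ignores them.
    return TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
  }
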
/// @@ -1167,7 +1171,9 @@ const User *U) = 0; virtual int getMemcpyCost(const Instruction *I) = 0; virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) = 0; + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) = 0; virtual int getUserCost(const User *U, ArrayRef Operands) = 0; virtual bool hasBranchDivergence() = 0; @@ -1636,8 +1642,10 @@ return Impl.getMaxInterleaveFactor(VF); } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) override { - return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize); + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) override { + return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI); } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -114,7 +114,9 @@ } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) { + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { JTSize = 0; return SI.getNumCases(); } Index: include/llvm/CodeGen/AsmPrinter.h =================================================================== --- include/llvm/CodeGen/AsmPrinter.h +++ include/llvm/CodeGen/AsmPrinter.h @@ -48,6 +48,7 @@ class GlobalValue; class GlobalVariable; class MachineBasicBlock; +class MachineBlockFrequencyInfo; class MachineConstantPoolValue; class MachineDominatorTree; class MachineFunction; @@ -69,6 +70,7 @@ class MCTargetOptions; class MDNode; class Module; +class ProfileSummaryInfo; class raw_ostream; class StackMaps; class TargetLoweringObjectFile; @@ -107,6 +109,10 @@ /// Optimization remark emitter. MachineOptimizationRemarkEmitter *ORE; + MachineBlockFrequencyInfo *MBFI; + + ProfileSummaryInfo *PSI; + /// The symbol for the current function. This is recalculated at the beginning /// of each call to runOnMachineFunction(). MCSymbol *CurrentFnSym = nullptr; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -325,7 +325,9 @@ } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JumpTableSize) { + unsigned &JumpTableSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { /// Try to find the estimated number of clusters. Note that the number of /// clusters identified in this function could be different from the actual /// numbers found in lowering. 
This function ignores switches that are lowered with a mix of jump table / bit test / BTree. @@ -373,7 +375,7 @@ (MaxCaseVal - MinCaseVal) .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; // Check whether a range of clusters is dense enough for a jump table - if (TLI->isSuitableForJumpTable(&SI, N, Range)) { + if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { JumpTableSize = Range; return 1; } Index: include/llvm/CodeGen/ExecutionDomainFix.h =================================================================== --- include/llvm/CodeGen/ExecutionDomainFix.h +++ include/llvm/CodeGen/ExecutionDomainFix.h @@ -23,6 +23,8 @@ #define LLVM_CODEGEN_EXECUTIONDOMAINFIX_H #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LoopTraversal.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" @@ -126,6 +128,9 @@ ReachingDefAnalysis *RDA; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; + public: ExecutionDomainFix(char &PassID, const TargetRegisterClass &RC) : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {} @@ -133,6 +138,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired<ReachingDefAnalysis>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); MachineFunctionPass::getAnalysisUsage(AU); } Index: include/llvm/CodeGen/LiveRangeEdit.h =================================================================== --- include/llvm/CodeGen/LiveRangeEdit.h +++ include/llvm/CodeGen/LiveRangeEdit.h @@ -34,6 +34,7 @@ namespace llvm { class LiveIntervals; +class ProfileSummaryInfo; class MachineBlockFrequencyInfo; class MachineInstr; class MachineLoopInfo; @@ -103,14 +104,17 @@ /// foldAsLoad - If LI has a single use and a single def that can be folded as /// a load, eliminate the register by folding the def into the use. - bool foldAsLoad(LiveInterval *LI, SmallVectorImpl<MachineInstr *> &Dead); + bool foldAsLoad(LiveInterval *LI, SmallVectorImpl<MachineInstr *> &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI); using ToShrinkSet = SetVector<LiveInterval *, SmallVector<LiveInterval *, 8>, SmallPtrSet<LiveInterval *, 8>>; /// Helper for eliminateDeadDefs. void eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, - AliasAnalysis *AA); + AliasAnalysis *AA, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI); /// MachineRegisterInfo callback to notify when new virtual /// registers are created. @@ -243,6 +247,8 @@ /// allocator. These registers should not be split into new intervals /// as currently those new intervals are not guaranteed to spill. void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, ArrayRef<unsigned> RegsBeingSpilled = None, AliasAnalysis *AA = nullptr); Index: include/llvm/CodeGen/MachineSizeOpts.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/MachineSizeOpts.h @@ -0,0 +1,35 @@ +//===- MachineSizeOpts.h - machine size optimization ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains some shared machine IR code size optimization related +// code.
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_MACHINE_SIZEOPTS_H +#define LLVM_CODEGEN_MACHINE_SIZEOPTS_H + +namespace llvm { + +class ProfileSummaryInfo; +class MachineBasicBlock; +class MachineBlockFrequencyInfo; +class MachineFunction; + +/// Returns true if machine function \p MF is suggested to be size-optimized +/// based on the profile. +bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *BFI); +/// Returns true if machine basic block \p MBB is suggested to be size-optimized +/// based on the profile. +bool shouldOptimizeForSize(const MachineBasicBlock *MBB, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI); + +} // end namespace llvm + +#endif // LLVM_CODEGEN_MACHINE_SIZEOPTS_H Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -59,6 +59,7 @@ namespace llvm { class BlockAddress; +class BlockFrequencyInfo; class Constant; class ConstantFP; class ConstantInt; @@ -71,6 +72,7 @@ class MachineConstantPoolValue; class MCSymbol; class OptimizationRemarkEmitter; +class ProfileSummaryInfo; class SDDbgValue; class SDDbgLabel; class SelectionDAG; @@ -235,6 +237,9 @@ /// whenever manipulating the DAG. OptimizationRemarkEmitter *ORE; + ProfileSummaryInfo *PSI; + BlockFrequencyInfo *BFI; + /// The starting token. SDNode EntryNode; @@ -397,7 +402,8 @@ /// Prepare this SelectionDAG to process code in the given MachineFunction. void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - LegacyDivergenceAnalysis * Divergence); + LegacyDivergenceAnalysis * Divergence, + ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin); void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) { FLI = FuncInfo; @@ -419,6 +425,8 @@ const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; } LLVMContext *getContext() const {return Context; } OptimizationRemarkEmitter &getORE() const { return *ORE; } + ProfileSummaryInfo *getPSI() const { return PSI; } + BlockFrequencyInfo *getBFI() const { return BFI; } /// Pop up a GraphViz/gv window with the DAG rendered using 'dot'. void viewGraph(const std::string &Title); @@ -1693,6 +1701,8 @@ return It->second.HeapAllocSite; } + bool shouldOptForSize() const; + private: void InsertNode(SDNode *N); bool RemoveNodeFromCSEMaps(SDNode *N); Index: include/llvm/CodeGen/SelectionDAGISel.h =================================================================== --- include/llvm/CodeGen/SelectionDAGISel.h +++ include/llvm/CodeGen/SelectionDAGISel.h @@ -38,6 +38,8 @@ class GCFunctionInfo; class ScheduleDAGSDNodes; class LoadInst; + class ProfileSummaryInfo; + class BlockFrequencyInfo; /// SelectionDAGISel - This is the common base class used for SelectionDAG-based /// pattern-matching instruction selectors. @@ -248,6 +250,11 @@ virtual StringRef getIncludePathForIndex(unsigned index) { llvm_unreachable("Tblgen should generate the implementation of this!"); } + + bool shouldOptForSize(const MachineFunction *MF) const { + return CurDAG->shouldOptForSize(); + } + public: // Calls to these predicates are generated by tblgen.
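SelectionDAG::shouldOptForSize is only declared here; its definition is not part of this section, but given the getPSI/getBFI accessors above it presumably ORs the function's hasOptSize() attribute with the block-level PGSO query. At the MIR level, the new MachineSizeOpts.h interface is meant to be wired into a pass roughly as follows. This is an illustrative sketch (the pass name is invented) condensing the pattern this patch applies in MachineCombiner and PeepholeOptimizer:

  #include "llvm/Analysis/ProfileSummaryInfo.h"
  #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/CodeGen/MachineSizeOpts.h"
  using namespace llvm;

  namespace {
  struct DemoSizePass : MachineFunctionPass {
    static char ID;
    DemoSizePass() : MachineFunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<ProfileSummaryInfoWrapperPass>();
      AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      ProfileSummaryInfo *PSI =
          &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
      // Only compute block frequencies when a profile summary exists.
      MachineBlockFrequencyInfo *MBFI =
          PSI->hasProfileSummary()
              ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
              : nullptr;
      bool OptForSize = MF.getFunction().hasOptSize() ||
                        shouldOptimizeForSize(&MF, PSI, MBFI);
      (void)OptForSize; // drive size-vs-speed decisions from this
      return false;
    }
  };
  char DemoSizePass::ID = 0;
  } // end anonymous namespace
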
bool CheckAndMask(SDValue LHS, ConstantSDNode *RHS, Index: include/llvm/CodeGen/SwitchLoweringUtils.h =================================================================== --- include/llvm/CodeGen/SwitchLoweringUtils.h +++ include/llvm/CodeGen/SwitchLoweringUtils.h @@ -19,6 +19,7 @@ class FunctionLoweringInfo; class MachineBasicBlock; +class BlockFrequencyInfo; namespace SwitchCG { @@ -263,7 +264,8 @@ std::vector BitTestCases; void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI, - MachineBasicBlock *DefaultMBB); + MachineBasicBlock *DefaultMBB, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); bool buildJumpTable(const CaseClusterVector &Clusters, unsigned First, unsigned Last, const SwitchInst *SI, @@ -294,4 +296,3 @@ } // namespace llvm #endif // LLVM_CODEGEN_SWITCHLOWERINGUTILS_H - Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -25,11 +25,13 @@ namespace llvm { class MachineBasicBlock; +class MachineBlockFrequencyInfo; class MachineBranchProbabilityInfo; class MachineFunction; class MachineInstr; class MachineModuleInfo; class MachineRegisterInfo; +class ProfileSummaryInfo; class TargetRegisterInfo; /// Utility class to perform tail duplication. @@ -40,6 +42,8 @@ const MachineModuleInfo *MMI; MachineRegisterInfo *MRI; MachineFunction *MF; + const MachineBlockFrequencyInfo *MBFI; + ProfileSummaryInfo *PSI; bool PreRegAlloc; bool LayoutMode; unsigned TailDupSize; @@ -65,6 +69,8 @@ /// default implies using the command line value TailDupSize. void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, + const MachineBlockFrequencyInfo *MBFI, + ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize = 0); bool tailDuplicateBlocks(); Index: include/llvm/CodeGen/TargetInstrInfo.h =================================================================== --- include/llvm/CodeGen/TargetInstrInfo.h +++ include/llvm/CodeGen/TargetInstrInfo.h @@ -42,12 +42,14 @@ class InstrItineraryData; class LiveIntervals; class LiveVariables; +class MachineBlockFrequencyInfo; class MachineMemOperand; class MachineRegisterInfo; class MCAsmInfo; class MCInst; struct MCSchedModel; class Module; +class ProfileSummaryInfo; class ScheduleDAG; class ScheduleHazardRecognizer; class SDNode; @@ -129,7 +131,10 @@ /// Do not call this method for a non-commutable instruction. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. - virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const; @@ -400,7 +405,9 @@ /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. MachineInstr * - commuteInstruction(MachineInstr &MI, bool NewMI = false, + commuteInstruction(MachineInstr &MI, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI = false, unsigned OpIdx1 = CommuteAnyOperandIndex, unsigned OpIdx2 = CommuteAnyOperandIndex) const; @@ -941,6 +948,8 @@ /// decide on using an opcode (note that those assignments can still change). 
MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops, int FI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const; @@ -948,6 +957,8 @@ /// store from / to any address, not just from a specific stack slot. MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const; /// Return true when there is potentially a faster code sequence @@ -1032,6 +1043,8 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const { return nullptr; @@ -1045,6 +1058,7 @@ virtual MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const { return nullptr; } @@ -1331,7 +1345,9 @@ virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const { + MachineInstr *&DefMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { return nullptr; } @@ -1453,7 +1469,9 @@ /// /// The bit (1 << Domain) must be set in the mask returned from /// getExecutionDomain(MI). - virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain) const {} + virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const {} /// Returns the preferred minimum clearance /// before an instruction with an unwanted partial register update. Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -29,6 +29,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -54,6 +55,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> #include <climits> @@ -1024,13 +1026,16 @@ /// Return true if lowering to a jump table is suitable for a set of case /// clusters which may contain \p NumCases cases, \p Range range of values. virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, - uint64_t Range) const { + uint64_t Range, ProfileSummaryInfo* PSI, + BlockFrequencyInfo *BFI) const { // FIXME: This function checks the maximum table size and density, but the // minimum size is not checked. It would be nice if the minimum size were // also checked within this function. Currently, the minimum size check is // performed in findJumpTable() in SelectionDAGBuilder and // getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
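The two-line replacement that follows is the template for nearly every call site this patch touches: a local "optimize for size?" decision becomes the OR of the existing function attribute and the new profile-guided query, so attribute-driven -Os/-Oz behavior is preserved and PGSO is purely additive. Schematically (IR level; the MIR overloads are used the same way):

  // The recurring PGSO pattern: the attribute keeps its old meaning, and the
  // profile can only turn *more* code into size-optimized code, never less.
  bool OptForSize = F.hasOptSize() ||                          // -Os / -Oz
                    llvm::shouldOptimizeForSize(BB, PSI, BFI); // PGSO
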
- const bool OptForSize = SI->getParent()->getParent()->hasOptSize(); + const bool OptForSize = SI->getParent()->getParent()->hasOptSize() || + llvm::shouldOptimizeForSize(SI->getParent(), PSI, + BFI); const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize); const unsigned MaxJumpTableSize = getMaximumJumpTableSize(); Index: include/llvm/Transforms/Utils/SizeOpts.h =================================================================== --- include/llvm/Transforms/Utils/SizeOpts.h +++ include/llvm/Transforms/Utils/SizeOpts.h @@ -22,11 +22,11 @@ /// Returns true if function \p F is suggested to be size-optimized based on the /// profile. -bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, +bool shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); /// Returns true if basic block \p BB is suggested to be size-optimized based /// on the profile. -bool shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, +bool shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); } // end namespace llvm Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -1467,8 +1467,9 @@ } unsigned JumpTableSize = 0; + BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; unsigned NumCaseCluster = - TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); + TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); // If suitable for a jump table, consider the cost for the table size and // branch to destination. Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -57,6 +57,24 @@ cl::desc("A fixed cold count that overrides the count derived from" " profile-summary-cutoff-cold")); +static cl::opt<int> PgsoCutoffInstrProf( + "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore, + cl::desc("A count is hot for PGSO if it exceeds the minimum count " + "to reach this percentile of total counts under instrumentation " + "profile.")); + +static cl::opt<int> PgsoCutoffSampleProf( + "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore, + cl::desc("A count is hot for PGSO if it exceeds the minimum count " + "to reach this percentile of total counts under sample profile.")); + +static cl::opt<int> PgsoHugeWorkingSetSizeThreshold( + "pgso-huge-working-set-size-threshold", cl::Hidden, + cl::init(12500), cl::ZeroOrMore, + cl::desc("The code working set size is considered huge for PGSO if the " + "number of blocks required to reach the " + "-profile-summary-cutoff-hot percentile exceeds this count.")); + // Find the summary entry for a desired percentile of counts. static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS, uint64_t Percentile) { @@ -186,6 +204,31 @@ return true; }
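The cutoffs are parts-per-million of the total count mass, like the existing -profile-summary-cutoff-hot (default 990000). A lower cutoff selects a higher count threshold: with the instrumentation default of 250000, a count must be large enough to sit among the hottest blocks covering the top 25% of all counts, a much stricter bar than the regular 99% hot threshold, so far less code is "hot for PGSO" and correspondingly more code becomes eligible for size optimization. The sample-profile default (800000) is looser because sampled counts are noisier. A toy model of the percentile-to-threshold mapping (simplified; the real lookup is getEntryForPercentile over the precomputed summary entries):

  #include <cstdint>
  #include <utility>
  #include <vector>

  // Each summary entry pairs a cutoff (parts per million of the total count
  // mass) with the minimum count needed to be inside that hottest fraction.
  using SummaryEntry = std::pair<uint64_t, uint64_t>; // (Cutoff, MinCount)

  // Entries are sorted by ascending cutoff, so the first entry whose cutoff
  // covers the requested percentile yields the hotness threshold.
  static uint64_t thresholdForPercentile(const std::vector<SummaryEntry> &DS,
                                         uint64_t Percentile) {
    for (const SummaryEntry &E : DS)
      if (E.first >= Percentile)
        return E.second; // counts >= this are "hot" at that cutoff
    return UINT64_MAX;   // unreachable for a well-formed summary
  }
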
+// Like isFunctionHotInCallGraph but for PGSO. +bool ProfileSummaryInfo::isFunctionPgsoHotInCallGraph(const Function *F, + BlockFrequencyInfo &BFI) { + if (!F || !computeSummary()) + return false; + if (auto FunctionCount = F->getEntryCount()) + if (isPgsoHotCount(FunctionCount.getCount())) + return true; + + if (hasSampleProfile()) { + uint64_t TotalCallCount = 0; + for (const auto &BB : *F) + for (const auto &I : BB) + if (isa<CallInst>(I) || isa<InvokeInst>(I)) + if (auto CallCount = getProfileCount(&I, nullptr)) + TotalCallCount += CallCount.getValue(); + if (isPgsoHotCount(TotalCallCount)) + return true; + } + for (const auto &BB : *F) + if (isPgsoHotBlock(&BB, &BFI)) + return true; + return false; +} + /// Returns true if the function's entry is cold. If it returns false, it /// either means it is not cold or it is unknown whether it is cold or not (for /// example, no profile data is available). @@ -222,6 +265,11 @@ "Cold count threshold cannot exceed hot count threshold!"); HasHugeWorkingSetSize = HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold; + auto &PgsoHotEntry = getEntryForPercentile(DetailedSummary, + hasSampleProfile() ? PgsoCutoffSampleProf : PgsoCutoffInstrProf); + PgsoHotCountThreshold = PgsoHotEntry.MinCount; + PgsoHasHugeWorkingSetSize = + HotEntry.NumCounts > PgsoHugeWorkingSetSizeThreshold; } bool ProfileSummaryInfo::hasHugeWorkingSetSize() { @@ -230,6 +278,12 @@ return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue(); } +bool ProfileSummaryInfo::pgsoHasHugeWorkingSetSize() { + if (!PgsoHasHugeWorkingSetSize) + computeThresholds(); + return PgsoHasHugeWorkingSetSize && PgsoHasHugeWorkingSetSize.getValue(); +} + bool ProfileSummaryInfo::isHotCount(uint64_t C) { if (!HotCountThreshold) computeThresholds(); @@ -242,6 +296,12 @@ return ColdCountThreshold && C <= ColdCountThreshold.getValue(); } +bool ProfileSummaryInfo::isPgsoHotCount(uint64_t C) { + if (!PgsoHotCountThreshold) + computeThresholds(); + return PgsoHotCountThreshold && C >= PgsoHotCountThreshold.getValue(); +} + uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() { if (!HotCountThreshold) computeThresholds(); @@ -265,6 +325,12 @@ return Count && isColdCount(*Count); } +bool ProfileSummaryInfo::isPgsoHotBlock(const BasicBlock *BB, + BlockFrequencyInfo *BFI) { + auto Count = BFI->getBlockProfileCount(BB); + return Count && isPgsoHotCount(*Count); +} + bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI) { auto C = getProfileCount(CS.getInstruction(), BFI); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -199,9 +199,10 @@ } unsigned -TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) const { - return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize); +TargetTransformInfo::getEstimatedNumberOfCaseClusters( + const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) const { + return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI); } int TargetTransformInfo::getUserCost(const User *U, Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -31,13 +31,16 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include
"llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -52,6 +55,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -252,6 +256,8 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); } bool AsmPrinter::doInitialization(Module &M) { @@ -1658,6 +1664,10 @@ } ORE = &getAnalysis().getORE(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; } namespace { @@ -2882,8 +2892,10 @@ void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB, MCCodePaddingContext &Context) const { assert(MF != nullptr && "Machine function must be valid"); + bool OptForSize = MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); Context.IsPaddingActive = !MF->hasInlineAsm() && - !MF->getFunction().hasOptSize() && + !OptForSize && TM.getOptLevel() != CodeGenOpt::None; Context.IsBasicBlockReachableViaFallthrough = std::find(MBB.pred_begin(), MBB.pred_end(), MBB.getPrevNode()) != Index: lib/CodeGen/BranchFolding.h =================================================================== --- lib/CodeGen/BranchFolding.h +++ lib/CodeGen/BranchFolding.h @@ -27,6 +27,7 @@ class MachineLoopInfo; class MachineModuleInfo; class MachineRegisterInfo; +class ProfileSummaryInfo; class raw_ostream; class TargetInstrInfo; class TargetRegisterInfo; @@ -39,6 +40,7 @@ bool CommonHoist, MBFIWrapper &FreqInfo, const MachineBranchProbabilityInfo &ProbInfo, + ProfileSummaryInfo *PSI, // Min tail length to merge. Defaults to commandline // flag. Ignored for optsize. 
unsigned MinTailLength = 0); @@ -145,6 +147,7 @@ const BlockFrequency Freq) const; void view(const Twine &Name, bool isSimple = true); uint64_t getEntryFreq() const; + const MachineBlockFrequencyInfo &getMBFI() { return MBFI; } private: const MachineBlockFrequencyInfo &MBFI; @@ -154,6 +157,7 @@ private: MBFIWrapper &MBBFreqInfo; const MachineBranchProbabilityInfo &MBPI; + ProfileSummaryInfo *PSI; bool TailMergeBlocks(MachineFunction &MF); bool TryTailMergeBlocks(MachineBasicBlock* SuccBB, Index: lib/CodeGen/BranchFolding.cpp =================================================================== --- lib/CodeGen/BranchFolding.cpp +++ lib/CodeGen/BranchFolding.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -38,6 +39,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -102,6 +104,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -128,7 +131,8 @@ BranchFolder::MBFIWrapper MBBFreqInfo( getAnalysis()); BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo, - getAnalysis()); + getAnalysis(), + &getAnalysis().getPSI()); return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable()); @@ -137,9 +141,10 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, MBFIWrapper &FreqInfo, const MachineBranchProbabilityInfo &ProbInfo, + ProfileSummaryInfo *PSI, unsigned MinTailLength) : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength), - MBBFreqInfo(FreqInfo), MBPI(ProbInfo) { + MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) { if (MinCommonTailLength == 0) MinCommonTailLength = TailMergeSize; switch (FlagEnableTailMerge) { @@ -635,7 +640,9 @@ MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB, DenseMap &EHScopeMembership, - bool AfterPlacement) { + bool AfterPlacement, + BranchFolder::MBFIWrapper &MBBFreqInfo, + ProfileSummaryInfo *PSI) { // It is never profitable to tail-merge blocks from two different EH scopes. if (!EHScopeMembership.empty()) { auto EHScope1 = EHScopeMembership.find(MBB1); @@ -721,7 +728,11 @@ // branch instruction, which is likely to be smaller than the 2 // instructions that would be deleted in the merge. 
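The code that follows puts this heuristic under PGSO control. One design choice worth noting: a tail is only treated as size-optimized when both candidate blocks pass the block-level query, so a tail shared between a hot block and a cold block is never merged at the hot block's expense. Condensed, with the names used in this file:

  // Both blocks must individually be size-optimized; hasOptSize() keeps the
  // old attribute behavior as a function-wide override.
  bool OptForSize =
      MF->getFunction().hasOptSize() ||
      (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) &&
       llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI()));
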
MachineFunction *MF = MBB1->getParent(); - return EffectiveTailLen >= 2 && MF->getFunction().hasOptSize() && + bool OptForSize = + MF->getFunction().hasOptSize() || + (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) && + llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI())); + return EffectiveTailLen >= 2 && OptForSize && (I1 == MBB1->begin() || I2 == MBB2->begin()); } @@ -743,7 +754,7 @@ CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, EHScopeMembership, - AfterBlockPlacement)) { + AfterBlockPlacement, MBBFreqInfo, PSI)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; @@ -1571,8 +1582,10 @@ } } - if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && - MF.getFunction().hasOptSize()) { + bool OptForSize = + MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo.getMBFI()); + if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) { // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch // direction, thereby defeating careful block placement and regressing // performance. Therefore, only consider this for optsize functions. Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -91,6 +91,7 @@ MachineRegisterInfo.cpp MachineScheduler.cpp MachineSink.cpp + MachineSizeOpts.cpp MachineSSAUpdater.cpp MachineTraceMetrics.cpp MachineVerifier.cpp Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -89,6 +89,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include #include #include @@ -251,6 +252,7 @@ const LoopInfo *LI; std::unique_ptr BFI; std::unique_ptr BPI; + ProfileSummaryInfo *PSI; /// As we scan instructions optimizing them, this is the next instruction /// to optimize. Transforms that can invalidate this should update it. @@ -293,7 +295,7 @@ /// Keep track of SExt promoted. ValueToSExts ValToSExtendedUses; - /// True if optimizing for size. + /// True if the function has the OptSize attribute. bool OptSize; /// DataLayout for the Function being processed. @@ -429,10 +431,8 @@ LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); + PSI = &getAnalysis().getPSI(); OptSize = F.hasOptSize(); - - ProfileSummaryInfo *PSI = - &getAnalysis().getPSI(); if (ProfileGuidedSectionPrefix) { if (PSI->isFunctionHotInCallGraph(&F, *BFI)) F.setSectionPrefix(".hot"); @@ -451,7 +451,9 @@ // bypassSlowDivision may create new BBs, but we don't want to reapply the // optimization to those blocks. BasicBlock* Next = BB->getNextNode(); - EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + // F.hasOptSize is already checked in the outer if statement. + if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); BB = Next; } } @@ -1842,7 +1844,8 @@ // cold block. This interacts with our handling for loads and stores to // ensure that we can fold all uses of a potential addressing computation // into their uses. 
TODO: generalize this to work over profiling data - if (!OptSize && CI->hasFnAttr(Attribute::Cold)) + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get()); + if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) for (auto &Arg : CI->arg_operands()) { if (!Arg->getType()->isPointerTy()) continue; @@ -2791,16 +2794,24 @@ /// When true, IsProfitableToFoldIntoAddressingMode always returns true. bool IgnoreProfitability; + /// True if we are optimizing for size. + bool OptSize; + + ProfileSummaryInfo *PSI; + BlockFrequencyInfo *BFI; + AddressingModeMatcher( SmallVectorImpl &AMI, const TargetLowering &TLI, const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, - std::pair, int64_t> &LargeOffsetGEP) + std::pair, int64_t> &LargeOffsetGEP, + bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), - PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) { + PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP), + OptSize(OptSize), PSI(PSI), BFI(BFI) { IgnoreProfitability = false; } @@ -2818,12 +2829,14 @@ const TargetLowering &TLI, const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, - std::pair, int64_t> &LargeOffsetGEP) { + std::pair, int64_t> &LargeOffsetGEP, + bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { ExtAddrMode Result; bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS, MemoryInst, Result, InsertedInsts, - PromotedInsts, TPT, LargeOffsetGEP) + PromotedInsts, TPT, LargeOffsetGEP, + OptSize, PSI, BFI) .matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; @@ -4437,7 +4450,8 @@ Instruction *I, SmallVectorImpl> &MemoryUses, SmallPtrSetImpl &ConsideredInsts, const TargetLowering &TLI, - const TargetRegisterInfo &TRI, int SeenInsts = 0) { + const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, int SeenInsts = 0) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -4446,8 +4460,6 @@ if (!MightBeFoldableInst(I)) return true; - const bool OptSize = I->getFunction()->hasOptSize(); - // Loop over all the uses, recursively processing them. for (Use &U : I->uses()) { // Conservatively return true if we're seeing a large number or a deep chain @@ -4488,7 +4500,9 @@ if (CallInst *CI = dyn_cast(UserI)) { // If this is a cold call, we can sink the addressing calculation into // the cold path. See optimizeCallInst - if (!OptSize && CI->hasFnAttr(Attribute::Cold)) + bool OptForSize = OptSize || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) continue; InlineAsm *IA = dyn_cast(CI->getCalledValue()); @@ -4500,8 +4514,8 @@ continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, - SeenInsts)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI, SeenInsts)) return true; } @@ -4589,7 +4603,8 @@ // the use is just a particularly nice way of sinking it. 
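This cold-call special case now keys off the block-level profile as well: even without -Os, address computations may be sunk into a call path that PGSO does not consider worth keeping fast. A hypothetical condensation of the gate used at both call sites in this file (the helper name is invented for illustration):

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/ProfileSummaryInfo.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Utils/SizeOpts.h"

  // A call is a "cold path we may sink into" only if we are not already
  // size-optimizing the enclosing block and the callee is attribute-cold.
  static bool isSinkableColdCall(const llvm::CallInst *CI, bool OptSize,
                                 llvm::ProfileSummaryInfo *PSI,
                                 llvm::BlockFrequencyInfo *BFI) {
    bool OptForSize =
        OptSize || llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
    return !OptForSize && CI->hasFnAttr(llvm::Attribute::Cold);
  }
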
SmallVector, 16> MemoryUses; SmallPtrSet ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI)) return false; // Has a non-memory, non-foldable use! // Now that we know that all uses of this instruction are part of a chain of @@ -4625,7 +4640,7 @@ TPT.getRestorationPoint(); AddressingModeMatcher Matcher( MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result, - InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); + InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); Matcher.IgnoreProfitability = true; bool Success = Matcher.matchAddr(Address, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -4731,7 +4746,8 @@ 0); ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, - InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); + InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, + BFI.get()); GetElementPtrInst *GEP = LargeOffsetGEP.first; if (GEP && !NewGEPBases.count(GEP)) { @@ -5949,7 +5965,9 @@ /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // If branch conversion isn't desirable, exit early. - if (DisableSelectToBranch || OptSize || !TLI) + if (DisableSelectToBranch || + OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) || + !TLI) return false; // Find all consecutive select instructions that share the same condition. Index: lib/CodeGen/ExecutionDomainFix.cpp =================================================================== --- lib/CodeGen/ExecutionDomainFix.cpp +++ lib/CodeGen/ExecutionDomainFix.cpp @@ -113,7 +113,7 @@ // Collapse all the instructions. while (!dv->Instrs.empty()) - TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain); + TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain, PSI, MBFI); dv->setSingleDomain(domain); // If there are multiple users, give them new, unique DomainValues. @@ -318,7 +318,7 @@ // If the collapsed operands force a single domain, propagate the collapse. if (isPowerOf2_32(available)) { unsigned domain = countTrailingZeros(available); - TII->setExecutionDomain(*mi, domain); + TII->setExecutionDomain(*mi, domain, PSI, MBFI); visitHardInstr(mi, domain); return; } @@ -436,6 +436,11 @@ RDA = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; + // Initialize the AliasMap on the first use. 
if (AliasMap.empty()) { // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and Index: lib/CodeGen/ExpandMemCmp.cpp =================================================================== --- lib/CodeGen/ExpandMemCmp.cpp +++ lib/CodeGen/ExpandMemCmp.cpp @@ -13,6 +13,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -20,6 +22,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Transforms/Utils/SizeOpts.h" using namespace llvm; @@ -720,7 +723,8 @@ /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] /// ret i32 %phi.res static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, - const TargetLowering *TLI, const DataLayout *DL) { + const TargetLowering *TLI, const DataLayout *DL, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { NumMemCmpCalls++; // Early exit from expansion if -Oz. @@ -741,18 +745,20 @@ // TTI call to check if target would like to expand memcmp. Also, get the // available load sizes. const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(), + bool OptForSize = CI->getFunction()->hasOptSize() || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp); if (!Options) return false; if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()) Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock; - if (CI->getFunction()->hasOptSize() && + if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize; - if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences()) + if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmp; MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL); @@ -798,7 +804,11 @@ &getAnalysis().getTLI(); const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - auto PA = runImpl(F, TLI, TTI, TL); + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; + auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI); return !PA.areAllPreserved(); } @@ -806,22 +816,26 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); FunctionPass::getAnalysisUsage(AU); } PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL); + const TargetLowering* TL, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); // Returns true if a change was made. 
bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL); + const DataLayout& DL, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); }; bool ExpandMemCmpPass::runOnBlock( BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL) { + const DataLayout& DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { for (Instruction& I : BB) { CallInst *CI = dyn_cast<CallInst>(&I); if (!CI) { @@ -830,7 +844,7 @@ LibFunc Func; if (TLI->getLibFunc(ImmutableCallSite(CI), Func) && (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && - expandMemCmp(CI, TTI, TL, &DL)) { + expandMemCmp(CI, TTI, TL, &DL, PSI, BFI)) { return true; } } @@ -840,11 +854,12 @@ PreservedAnalyses ExpandMemCmpPass::runImpl( Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL) { + const TargetLowering* TL, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { const DataLayout& DL = F.getParent()->getDataLayout(); bool MadeChanges = false; for (auto BBIt = F.begin(); BBIt != F.end();) { - if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) { + if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI)) { MadeChanges = true; // If changes were made, restart the function from the beginning, since // the structure of the function was changed. @@ -863,6 +878,8 @@ "Expand memcmp() to load/stores", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", "Expand memcmp() to load/stores", false, false) Index: lib/CodeGen/GlobalISel/IRTranslator.cpp =================================================================== --- lib/CodeGen/GlobalISel/IRTranslator.cpp +++ lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -464,7 +464,7 @@ return true; } - SL->findJumpTables(Clusters, &SI, DefaultMBB); + SL->findJumpTables(Clusters, &SI, DefaultMBB, nullptr, nullptr); LLVM_DEBUG({ dbgs() << "Case clusters: "; Index: lib/CodeGen/IfConversion.cpp =================================================================== --- lib/CodeGen/IfConversion.cpp +++ lib/CodeGen/IfConversion.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -211,6 +212,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -333,6 +335,7 @@ INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { @@ -345,6 +348,8 @@ TRI = ST.getRegisterInfo(); BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>()); MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); + ProfileSummaryInfo *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); @@ -355,7 +360,7 @@ bool BFChange = false; if (!PreRegAlloc) { // Tail merge tends to expose more if-conversion opportunities.
- BranchFolder BF(true, false, MBFI, *MBPI); + BranchFolder BF(true, false, MBFI, *MBPI, PSI); BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(), getAnalysisIfAvailable()); } @@ -495,7 +500,7 @@ BBAnalysis.clear(); if (MadeChange && IfCvtBranchFold) { - BranchFolder BF(false, false, MBFI, *MBPI); + BranchFolder BF(false, false, MBFI, *MBPI, PSI); BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable()); } Index: lib/CodeGen/InlineSpiller.cpp =================================================================== --- lib/CodeGen/InlineSpiller.cpp +++ lib/CodeGen/InlineSpiller.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -94,6 +95,7 @@ const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; const MachineBlockFrequencyInfo &MBFI; + ProfileSummaryInfo *PSI; InsertPointAnalysis IPA; @@ -146,6 +148,7 @@ MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(pass.getAnalysis()), + PSI(&pass.getAnalysis().getPSI()), IPA(LIS, mf.getNumBlockIDs()) {} void addToMergeableSpills(MachineInstr &Spill, int StackSlot, @@ -167,6 +170,7 @@ const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; const MachineBlockFrequencyInfo &MBFI; + ProfileSummaryInfo *PSI; // Variables that are valid during spill(), but used by multiple methods. LiveRangeEdit *Edit; @@ -202,6 +206,7 @@ MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(pass.getAnalysis()), + PSI(&pass.getAnalysis().getPSI()), HSpiller(pass, mf, vrm) {} void spill(LiveRangeEdit &) override; @@ -684,7 +689,7 @@ if (DeadDefs.empty()) return; LLVM_DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n"); - Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA); + Edit->eliminateDeadDefs(DeadDefs, PSI, &MBFI, RegsToSpill, AA); // LiveRangeEdit::eliminateDeadDef is used to remove dead define instructions // after rematerialization. To remove a VNI for a vreg from its LiveInterval, @@ -835,8 +840,9 @@ MachineInstrSpan MIS(MI, MI->getParent()); MachineInstr *FoldMI = - LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS) - : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS, &VRM); + LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, PSI, &MBFI, &LIS) + : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, PSI, &MBFI, &LIS, + &VRM); if (!FoldMI) return false; @@ -1085,7 +1091,7 @@ // Hoisted spills may cause dead code. if (!DeadDefs.empty()) { LLVM_DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n"); - Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA); + Edit->eliminateDeadDefs(DeadDefs, PSI, &MBFI, RegsToSpill, AA); } // Finally delete the SnippetCopies. 
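With PSI and MBFI now threaded from InlineSpiller through every foldMemoryOperand call into the foldMemoryOperandImpl hooks, targets can let the profile influence folding decisions. Nothing in this section shows a target doing so yet; a hypothetical override (the target class is invented) might look like:

  // Sketch only: consult PGSO before choosing a smaller-but-slower fold.
  MachineInstr *MyTargetInstrInfo::foldMemoryOperandImpl(
      MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
      MachineBasicBlock::iterator InsertPt, int FrameIndex,
      ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
      LiveIntervals *LIS, VirtRegMap *VRM) const {
    bool OptForSize = MF.getFunction().hasOptSize() ||
                      llvm::shouldOptimizeForSize(MI.getParent(), PSI, MBFI);
    // ... use OptForSize to pick between fold candidates ...
    return nullptr; // fall back to the generic path in this sketch
  }
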
@@ -1524,7 +1530,7 @@ RMEnt->RemoveOperand(i - 1); } } - Edit.eliminateDeadDefs(SpillsToRm, None, AA); + Edit.eliminateDeadDefs(SpillsToRm, PSI, &MBFI, None, AA); } } Index: lib/CodeGen/LiveRangeEdit.cpp =================================================================== --- lib/CodeGen/LiveRangeEdit.cpp +++ lib/CodeGen/LiveRangeEdit.cpp @@ -12,8 +12,10 @@ #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/VirtRegMap.h" @@ -183,7 +185,9 @@ } bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, - SmallVectorImpl &Dead) { + SmallVectorImpl &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { MachineInstr *DefMI = nullptr, *UseMI = nullptr; // Check that there is a single def and a single use. @@ -226,7 +230,8 @@ if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second) return false; - MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS); + MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, + PSI, MBFI, &LIS); if (!FoldMI) return false; LLVM_DEBUG(dbgs() << " folded: " << *FoldMI); @@ -258,7 +263,8 @@ /// Find all live intervals that need to shrink, then remove the instruction. void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, - AliasAnalysis *AA) { + AliasAnalysis *AA, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { assert(MI->allDefsAreDead() && "Def isn't really dead"); SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot(); @@ -390,6 +396,8 @@ } void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, ArrayRef RegsBeingSpilled, AliasAnalysis *AA) { ToShrinkSet ToShrink; @@ -397,7 +405,7 @@ for (;;) { // Erase all dead defs. while (!Dead.empty()) - eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA); + eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA, PSI, MBFI); if (ToShrink.empty()) break; @@ -405,7 +413,7 @@ // Shrink just one live interval. Then delete new dead defs. LiveInterval *LI = ToShrink.back(); ToShrink.pop_back(); - if (foldAsLoad(LI, Dead)) + if (foldAsLoad(LI, Dead, PSI, MBFI)) continue; unsigned VReg = LI->reg; if (TheDelegate) Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -41,6 +42,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -361,6 +363,8 @@ /// A handle to the post dominator tree. MachinePostDominatorTree *MPDT; + ProfileSummaryInfo *PSI; + /// Duplicator used to duplicate tails during placement. 
/// /// Placement decisions can open up new tail duplication opportunities, but @@ -536,6 +540,7 @@ if (TailDupPlacement) AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); AU.addRequired<TargetPassConfig>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -553,6 +558,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) @@ -2024,7 +2030,10 @@ // i.e. when the layout predecessor does not fallthrough to the loop header. // In practice this never happens though: there always seems to be a preheader // that can fallthrough and that is also placed before the header. - if (F->getFunction().hasOptSize()) + bool OptForSize = F->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(L.getHeader(), PSI, + &MBFI->getMBFI()); + if (OptForSize) return L.getHeader(); MachineBasicBlock *OldTop = nullptr; @@ -2780,6 +2789,11 @@ if (Freq < (LoopHeaderFreq * ColdProb)) continue; + // If the global profile indicates so, don't align it. + if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) && + !TLI->alignLoopsWithOptSize()) + continue; + // Check for the existence of a non-layout predecessor which would benefit // from aligning this block. MachineBasicBlock *LayoutPred = @@ -2987,6 +3001,7 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MPDT = nullptr; + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. @@ -3017,10 +3032,13 @@ if (allowTailDupPlacement()) { MPDT = &getAnalysis<MachinePostDominatorTree>(); - if (MF.getFunction().hasOptSize()) + bool OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); + if (OptForSize) TailDupSize = 1; bool PreRegAlloc = false; - TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize); + TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI, + /* LayoutMode */ true, TailDupSize); precomputeTriangleChains(); } @@ -3036,7 +3054,7 @@ if (MF.size() > 3 && EnableTailMerge) { unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, - *MBPI, TailMergeSize); + *MBPI, PSI, TailMergeSize); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>(), MLI, Index: lib/CodeGen/MachineCSE.cpp =================================================================== --- lib/CodeGen/MachineCSE.cpp +++ lib/CodeGen/MachineCSE.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -67,6 +68,7 @@ AliasAnalysis *AA; MachineDominatorTree *DT; MachineRegisterInfo *MRI; + ProfileSummaryInfo *PSI; MachineBlockFrequencyInfo *MBFI; public: @@ -87,6 +89,7 @@ AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineBlockFrequencyInfo>(); AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); } void releaseMemory() override { @@ -538,7 +541,7 @@ // Commute commutable instructions.
bool Commuted = false; if (!FoundCSE && MI->isCommutable()) { - if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) { + if (MachineInstr *NewMI = TII->commuteInstruction(*MI, PSI, MBFI)) { Commuted = true; FoundCSE = VNT.count(NewMI); if (NewMI != MI) { @@ -547,7 +550,7 @@ Changed = true; } else if (!FoundCSE) // MI was changed but it didn't help, commute it back! - (void)TII->commuteInstruction(*MI); + (void)TII->commuteInstruction(*MI, PSI, MBFI); } } @@ -889,6 +892,7 @@ DT = &getAnalysis<MachineDominatorTree>(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); bool ChangedPRE, ChangedCSE; ChangedPRE = PerformSimplePRE(DT); ChangedCSE = PerformCSE(DT->getRootNode()); Index: lib/CodeGen/MachineCombiner.cpp =================================================================== --- lib/CodeGen/MachineCombiner.cpp +++ lib/CodeGen/MachineCombiner.cpp @@ -12,11 +12,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -66,6 +69,8 @@ MachineLoopInfo *MLI; // Current MachineLoopInfo MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; + MachineBlockFrequencyInfo *MBFI; + ProfileSummaryInfo *PSI; TargetSchedModel TSchedModel; @@ -82,7 +87,7 @@ StringRef getPassName() const override { return "Machine InstCombiner"; } private: - bool doSubstitute(unsigned NewSize, unsigned OldSize); + bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize); bool combineInstructions(MachineBasicBlock *); MachineInstr *getOperandDef(const MachineOperand &MO); unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -131,6 +136,8 @@ AU.addPreserved<MachineLoopInfo>(); AU.addRequired<MachineTraceMetrics>(); AU.addPreserved<MachineTraceMetrics>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -408,8 +415,9 @@ /// \returns true when new instruction sequence should be generated /// independent of whether it lengthens critical path or not -bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { - if (OptSize && (NewSize < OldSize)) +bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize, + bool OptForSize) { + if (OptForSize && (NewSize < OldSize)) return true; if (!TSchedModel.hasInstrSchedModelOrItineraries()) return true; @@ -507,6 +515,8 @@ SparseSet<LiveRegUnit> RegUnits; RegUnits.setUniverse(TRI->getNumRegUnits()); + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI); + while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; SmallVector<MachineCombinerPattern, 16> Patterns; @@ -583,7 +593,8 @@ // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { + if (SubstituteAlways || + doSubstitute(NewInstCount, OldInstCount, OptForSize)) { insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, RegUnits, IncrementalUpdate); // Eagerly stop after the first pattern fires. @@ -638,6 +649,10 @@ MRI = &MF.getRegInfo(); MLI = &getAnalysis<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : + nullptr; MinInstr = nullptr; OptSize = MF.getFunction().hasOptSize(); Index: lib/CodeGen/MachineSizeOpts.cpp =================================================================== --- /dev/null +++ lib/CodeGen/MachineSizeOpts.cpp @@ -0,0 +1,101 @@ +//===- MachineSizeOpts.cpp - code size optimization related code ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains some shared machine IR code size optimization related +// code. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineSizeOpts.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/Transforms/Utils/SizeOpts.h" + +using namespace llvm; + +extern cl::opt<bool> EnablePGSO; +extern cl::opt<bool> PGSOHugeWorkingSetSizeOnly; +extern cl::opt<bool> ForcePGSO; + +/// Like ProfileSummaryInfo::isColdBlock but for MachineBasicBlock. +static bool isColdBlock(const MachineBasicBlock *MBB, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { + auto Count = MBFI->getBlockProfileCount(MBB); + return Count && PSI->isColdCount(*Count); +} + +/// Like ProfileSummaryInfo::isPgsoHotBlock but for MachineBasicBlock. +static bool isPgsoHotBlock(const MachineBasicBlock *MBB, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { + auto Count = MBFI->getBlockProfileCount(MBB); + return Count && PSI->isPgsoHotCount(*Count); +} + +/// Like ProfileSummaryInfo::isFunctionColdInCallGraph but for +/// MachineFunction. +static bool isFunctionColdInCallGraph( + const MachineFunction *MF, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo &MBFI) { + if (auto FunctionCount = MF->getFunction().getEntryCount()) + if (!PSI->isColdCount(FunctionCount.getCount())) + return false; + for (const auto &MBB : *MF) + if (!isColdBlock(&MBB, PSI, &MBFI)) + return false; + return true; +} + +/// Like ProfileSummaryInfo::isFunctionPgsoHotInCallGraph but for +/// MachineFunction. +static bool isFunctionPgsoHotInCallGraph( + const MachineFunction *MF, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo &MBFI) { + if (auto FunctionCount = MF->getFunction().getEntryCount()) + if (PSI->isPgsoHotCount(FunctionCount.getCount())) + return true; + for (const auto &MBB : *MF) + if (isPgsoHotBlock(&MBB, PSI, &MBFI)) + return true; + return false; +} + +bool llvm::shouldOptimizeForSize(const MachineFunction *MF, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { + assert(MF); + if (!PSI || !MBFI || !PSI->hasProfileSummary()) + return false; + if (ForcePGSO) + return true; + if (!EnablePGSO) + return false; + if (PGSOHugeWorkingSetSizeOnly && !PSI->pgsoHasHugeWorkingSetSize()) + // Even if the working set size isn't huge, size-optimize cold code.
+ return isFunctionColdInCallGraph(MF, PSI, *MBFI); + return !isFunctionPgsoHotInCallGraph(MF, PSI, *MBFI); +} + +bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { + assert(MBB); + if (!PSI || !MBFI || !PSI->hasProfileSummary()) + return false; + if (ForcePGSO) + return true; + if (!EnablePGSO) + return false; + if (PGSOHugeWorkingSetSizeOnly && !PSI->pgsoHasHugeWorkingSetSize()) + // Even if the working set size isn't huge, size-optimize cold code. + return isColdBlock(MBB, PSI, MBFI); + return !isPgsoHotBlock(MBB, PSI, MBFI); +} Index: lib/CodeGen/PeepholeOptimizer.cpp =================================================================== --- lib/CodeGen/PeepholeOptimizer.cpp +++ lib/CodeGen/PeepholeOptimizer.cpp @@ -71,6 +71,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" @@ -156,6 +158,8 @@ MachineRegisterInfo *MRI; MachineDominatorTree *DT; // Machine dominator tree MachineLoopInfo *MLI; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; public: static char ID; // Pass identification @@ -175,6 +179,8 @@ AU.addRequired(); AU.addPreserved(); } + AU.addRequired(); + AU.addRequired(); } /// Track Def -> Use info used for rewriting copies. @@ -1580,7 +1586,7 @@ auto CP = RI.getCommutePair(); if (CP) { Changed = true; - TII->commuteInstruction(*(RI.getMI()), false, (*CP).first, + TII->commuteInstruction(*(RI.getMI()), PSI, MBFI, false, (*CP).first, (*CP).second); LLVM_DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI())); } @@ -1605,6 +1611,10 @@ MRI = &MF.getRegInfo(); DT = Aggressive ? &getAnalysis() : nullptr; MLI = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; bool Changed = false; @@ -1767,7 +1777,8 @@ unsigned FoldedReg = FoldAsLoadDefReg; MachineInstr *DefMI = nullptr; if (MachineInstr *FoldMI = - TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) { + TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI, + PSI, MBFI)) { // Update LocalMIs since we replaced MI with FoldMI and deleted // DefMI. LLVM_DEBUG(dbgs() << "Replacing: " << *MI); Index: lib/CodeGen/RegAllocBasic.cpp =================================================================== --- lib/CodeGen/RegAllocBasic.cpp +++ lib/CodeGen/RegAllocBasic.cpp @@ -16,6 +16,7 @@ #include "RegAllocBase.h" #include "Spiller.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -190,6 +191,7 @@ AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); // Needed for InlineSpiller. 
MachineFunctionPass::getAnalysisUsage(AU); } Index: lib/CodeGen/RegAllocGreedy.cpp =================================================================== --- lib/CodeGen/RegAllocGreedy.cpp +++ lib/CodeGen/RegAllocGreedy.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/LiveInterval.h" @@ -626,6 +627,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); // Needed for InlineSpiller. MachineFunctionPass::getAnalysisUsage(AU); } @@ -3239,6 +3241,7 @@ SpillPlacer = &getAnalysis(); DebugVars = &getAnalysis(); AA = &getAnalysis().getAAResults(); + auto PSI = &getAnalysis().getPSI(); initializeCSRCost(); @@ -3247,7 +3250,7 @@ LLVM_DEBUG(LIS->dump()); SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); - SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI)); + SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, PSI)); ExtraRegInfo.clear(); ExtraRegInfo.resize(MRI->getNumVirtRegs()); NextCascade = 1; Index: lib/CodeGen/RegAllocPBQP.cpp =================================================================== --- lib/CodeGen/RegAllocPBQP.cpp +++ lib/CodeGen/RegAllocPBQP.cpp @@ -40,6 +40,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -549,6 +550,7 @@ au.addPreserved(); au.addRequired(); au.addPreserved(); + au.addRequired(); // Needed for InlineSpiller. MachineFunctionPass::getAnalysisUsage(au); } Index: lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- lib/CodeGen/RegisterCoalescer.cpp +++ lib/CodeGen/RegisterCoalescer.cpp @@ -21,6 +21,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -129,6 +131,8 @@ const MachineLoopInfo* Loops; AliasAnalysis *AA; RegisterClassInfo RegClassInfo; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; /// A LaneMask to remember on which subregister live ranges we need to call /// shrinkToUses() later. @@ -538,13 +542,15 @@ AU.addRequired(); AU.addPreserved(); AU.addPreservedID(MachineDominatorsID); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } void RegisterCoalescer::eliminateDeadDefs() { SmallVector NewRegs; LiveRangeEdit(nullptr, NewRegs, *MF, *LIS, - nullptr, this).eliminateDeadDefs(DeadDefs); + nullptr, this).eliminateDeadDefs(DeadDefs, PSI, MBFI); } void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) { @@ -832,7 +838,7 @@ // transformation. Start by commuting the instruction. 
MachineBasicBlock *MBB = DefMI->getParent(); MachineInstr *NewMI = - TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx); + TII->commuteInstruction(*DefMI, PSI, MBFI, false, UseOpIdx, NewDstIdx); if (!NewMI) return { false, false }; if (Register::isVirtualRegister(IntA.reg) && @@ -3686,6 +3692,10 @@ JoinGlobalCopies = STI.enableJoinGlobalCopies(); else JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; // The MachineScheduler does not currently require JoinSplitEdges. This will // either be enabled unconditionally or replaced by a more general live range Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -217,7 +217,7 @@ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { - ForCodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + ForCodeSize = DAG.shouldOptForSize(); MaximumLegalStoreInBits = 0; for (MVT VT : MVT::all_valuetypes()) @@ -12847,7 +12847,7 @@ // Assume that libcalls are the smallest code. // TODO: This restriction should probably be lifted for vectors. - if (DAG.getMachineFunction().getFunction().hasOptSize()) + if (ForCodeSize) return SDValue(); // pow(X, 0.25) --> sqrt(sqrt(X)) Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -24,6 +24,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -63,6 +65,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include #include #include @@ -1005,7 +1008,9 @@ void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - LegacyDivergenceAnalysis * Divergence) { + LegacyDivergenceAnalysis * Divergence, + ProfileSummaryInfo *PSIin, + BlockFrequencyInfo *BFIin) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; @@ -1014,6 +1019,8 @@ LibInfo = LibraryInfo; Context = &MF->getFunction().getContext(); DA = Divergence; + PSI = PSIin; + BFI = BFIin; } SelectionDAG::~SelectionDAG() { @@ -1023,6 +1030,11 @@ delete DbgInfo; } +bool SelectionDAG::shouldOptForSize() const { + return MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(FLI->MBB->getBasicBlock(), PSI, BFI); +} + void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); @@ -1425,7 +1437,7 @@ assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) - Alignment = MF->getFunction().hasOptSize() + Alignment = shouldOptForSize() ? getDataLayout().getABITypeAlignment(C->getType()) : getDataLayout().getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? 
ISD::TargetConstantPool : ISD::ConstantPool; @@ -5733,12 +5745,13 @@ SrcDelta + G->getOffset()); } -static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { +static bool shouldLowerMemFuncForSize(const MachineFunction &MF, + SelectionDAG &DAG) { // On Darwin, -Os means optimize for size without hurting performance, so // only really optimize for size when -Oz (MinSize) is used. if (MF.getTarget().getTargetTriple().isOSDarwin()) return MF.getFunction().hasMinSize(); - return MF.getFunction().hasOptSize(); + return DAG.shouldOptForSize(); } static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl, @@ -5788,7 +5801,7 @@ bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); + bool OptSize = shouldLowerMemFuncForSize(MF, DAG); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -5972,7 +5985,7 @@ bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); + bool OptSize = shouldLowerMemFuncForSize(MF, DAG); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -6078,7 +6091,7 @@ bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); + bool OptSize = shouldLowerMemFuncForSize(MF, DAG); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -28,10 +28,12 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -5286,8 +5288,8 @@ if (Val == 0) return DAG.getConstantFP(1.0, DL, LHS.getValueType()); - const Function &F = DAG.getMachineFunction().getFunction(); - if (!F.hasOptSize() || + bool OptForSize = DAG.shouldOptForSize(); + if (!OptForSize || // If optimizing for size, don't insert too many multiplies. // This inserts up to 5 multiplies. 
countPopulation(Val) + Log2_32(Val) < 7) { @@ -10493,7 +10495,7 @@ return; } - SL->findJumpTables(Clusters, &SI, DefaultMBB); + SL->findJumpTables(Clusters, &SI, DefaultMBB, DAG.getPSI(), DAG.getBFI()); SL->findBitTestClusters(Clusters, &SI); LLVM_DEBUG({ Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -27,7 +27,9 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/FastISel.h" @@ -339,6 +341,8 @@ AU.addRequired(); if (UseMBPI && OptLevel != CodeGenOpt::None) AU.addRequired(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -441,13 +445,17 @@ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable(); LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); SplitCriticalSideEffectEdges(const_cast(Fn), DT, LI); CurDAG->init(*MF, *ORE, this, LibInfo, - getAnalysisIfAvailable()); + getAnalysisIfAvailable(), PSI, BFI); FuncInfo->set(Fn, *MF, CurDAG); SwiftError->setFunction(*MF); Index: lib/CodeGen/SplitKit.h =================================================================== --- lib/CodeGen/SplitKit.h +++ lib/CodeGen/SplitKit.h @@ -36,6 +36,7 @@ class LiveIntervals; class LiveRangeEdit; +class ProfileSummaryInfo; class MachineBlockFrequencyInfo; class MachineDominatorTree; class MachineLoopInfo; @@ -264,6 +265,7 @@ const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; const MachineBlockFrequencyInfo &MBFI; + ProfileSummaryInfo *PSI; public: /// ComplementSpillMode - Select how the complement live range should be @@ -444,7 +446,8 @@ /// Newly created intervals will be appended to newIntervals. SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis, VirtRegMap &vrm, MachineDominatorTree &mdt, - MachineBlockFrequencyInfo &mbfi); + MachineBlockFrequencyInfo &mbfi, + ProfileSummaryInfo *PSI); /// reset - Prepare for a new split. 
void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition); Index: lib/CodeGen/SplitKit.cpp =================================================================== --- lib/CodeGen/SplitKit.cpp +++ lib/CodeGen/SplitKit.cpp @@ -365,12 +365,13 @@ SplitEditor::SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis, VirtRegMap &vrm, MachineDominatorTree &mdt, - MachineBlockFrequencyInfo &mbfi) + MachineBlockFrequencyInfo &mbfi, + ProfileSummaryInfo *PSI) : SA(sa), AA(aa), LIS(lis), VRM(vrm), MRI(vrm.getMachineFunction().getRegInfo()), MDT(mdt), TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()), TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()), - MBFI(mbfi), RegAssign(Allocator) {} + MBFI(mbfi), PSI(PSI), RegAssign(Allocator) {} void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { Edit = &LRE; @@ -1429,7 +1430,7 @@ if (Dead.empty()) return; - Edit->eliminateDeadDefs(Dead, None, &AA); + Edit->eliminateDeadDefs(Dead, PSI, &MBFI, None, &AA); } void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) { Index: lib/CodeGen/SwitchLoweringUtils.cpp =================================================================== --- lib/CodeGen/SwitchLoweringUtils.cpp +++ lib/CodeGen/SwitchLoweringUtils.cpp @@ -40,9 +40,12 @@ return NumCases; } -void SwitchCG::SwitchLowering::findJumpTables(CaseClusterVector &Clusters, - const SwitchInst *SI, - MachineBasicBlock *DefaultMBB) { +void SwitchCG::SwitchLowering::findJumpTables( + CaseClusterVector &Clusters, + const SwitchInst *SI, + MachineBasicBlock *DefaultMBB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { #ifndef NDEBUG // Clusters must be non-empty, sorted, and only contain Range clusters. assert(!Clusters.empty()); @@ -80,7 +83,7 @@ assert(Range >= NumCases); // Cheap case: the whole range may be suitable for jump table. - if (TLI->isSuitableForJumpTable(SI, NumCases, Range)) { + if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) { CaseCluster JTCluster; if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) { Clusters[0] = JTCluster; @@ -138,7 +141,7 @@ assert(NumCases < UINT64_MAX / 100); assert(Range >= NumCases); - if (TLI->isSuitableForJumpTable(SI, NumCases, Range)) { + if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) { unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]); unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1]; int64_t NumEntries = j - i + 1; Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -12,6 +12,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -37,6 +39,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -74,7 +78,11 @@ return false; auto MBPI = &getAnalysis(); - Duplicator.initMF(MF, PreRegAlloc, MBPI, /*LayoutMode=*/false); + auto *PSI = &getAnalysis().getPSI(); + auto *MBFI = (PSI && PSI->hasProfileSummary()) ? 
+ &getAnalysis().getBFI() : + nullptr; + Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI, PSI, /*LayoutMode=*/false); bool MadeChange = false; while (Duplicator.tailDuplicateBlocks()) Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -19,13 +19,16 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -77,6 +80,8 @@ void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPIin, + const MachineBlockFrequencyInfo *MBFIin, + ProfileSummaryInfo *PSIin, bool LayoutModeIn, unsigned TailDupSizeIn) { MF = &MFin; TII = MF->getSubtarget().getInstrInfo(); @@ -84,6 +89,8 @@ MRI = &MF->getRegInfo(); MMI = &MF->getMMI(); MBPI = MBPIin; + MBFI = MBFIin; + PSI = PSIin; TailDupSize = TailDupSizeIn; assert(MBPI != nullptr && "Machine Branch Probability Info required"); @@ -555,14 +562,14 @@ // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. unsigned MaxDuplicateCount; - if (TailDupSize == 0 && - TailDuplicateSize.getNumOccurrences() == 0 && - MF->getFunction().hasOptSize()) - MaxDuplicateCount = 1; - else if (TailDupSize == 0) + bool OptForSize = MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI); + if (TailDupSize == 0) MaxDuplicateCount = TailDuplicateSize; else MaxDuplicateCount = TailDupSize; + if (OptForSize) + MaxDuplicateCount = 1; // If the block to be duplicated ends in an unanalyzable fallthrough, don't // duplicate it. 
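The wiring above recurs, nearly verbatim, in every pass this patch touches: require ProfileSummaryInfoWrapperPass, compute MachineBlockFrequencyInfo lazily and only when a profile summary is actually present, then gate size-saving rewrites on llvm::shouldOptimizeForSize(). A minimal self-contained sketch of that pattern, using a hypothetical pass name (SizeOptSketchPass is illustrative only, not part of this patch):

// Sketch only: the PSI/MBFI wiring pattern shared by MachineCSE,
// MachineCombiner, PeepholeOptimizer, TailDuplication, etc. above.
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
using namespace llvm;

namespace {
struct SizeOptSketchPass : public MachineFunctionPass {
  static char ID;
  SizeOptSketchPass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    ProfileSummaryInfo *PSI =
        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    // The lazy wrapper means block frequencies are only computed when a
    // profile summary exists; without one, PGSO is a no-op.
    MachineBlockFrequencyInfo *MBFI =
        (PSI && PSI->hasProfileSummary())
            ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
            : nullptr;
    for (MachineBasicBlock &MBB : MF) {
      bool OptForSize = MF.getFunction().hasOptSize() ||
                        shouldOptimizeForSize(&MBB, PSI, MBFI);
      (void)OptForSize; // a real pass gates its size-saving rewrites on this
    }
    return false;
  }
};
} // end anonymous namespace
char SizeOptSketchPass::ID = 0;

Note the shape of the check: the function attribute (-Os/-Oz) still wins unconditionally, and the profile-guided predicate only widens the set of places treated as size-optimized.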
Index: lib/CodeGen/TargetInstrInfo.cpp =================================================================== --- lib/CodeGen/TargetInstrInfo.cpp +++ lib/CodeGen/TargetInstrInfo.cpp @@ -154,6 +154,8 @@ } MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned Idx1, unsigned Idx2) const { const MCInstrDesc &MCID = MI.getDesc(); @@ -236,7 +238,10 @@ return CommutedMI; } -MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, bool NewMI, +MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose @@ -248,7 +253,7 @@ "Precondition violation: MI must be commutable."); return nullptr; } - return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, OpIdx2); } bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1, @@ -530,6 +535,8 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, ArrayRef Ops, int FI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS, VirtRegMap *VRM) const { auto Flags = MachineMemOperand::MONone; @@ -577,7 +584,7 @@ MBB->insert(MI, NewMI); } else { // Ask the target to do the actual folding. - NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS, VRM); + NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, PSI, MBFI, LIS, VRM); } if (NewMI) { @@ -619,6 +626,8 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, ArrayRef Ops, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS) const { assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!"); #ifndef NDEBUG @@ -643,7 +652,7 @@ NewMI = &*MBB.insert(MI, NewMI); } else { // Ask the target to do the actual folding. - NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS); + NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, PSI, MBFI, LIS); } if (!NewMI) Index: lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- lib/CodeGen/TwoAddressInstructionPass.cpp +++ lib/CodeGen/TwoAddressInstructionPass.cpp @@ -33,6 +33,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" @@ -99,6 +101,8 @@ LiveIntervals *LIS; AliasAnalysis *AA; CodeGenOpt::Level OptLevel; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; // The current basic block being processed. 
MachineBasicBlock *MBB; @@ -188,6 +192,8 @@ AU.addPreserved(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -684,7 +690,8 @@ unsigned Dist) { Register RegC = MI->getOperand(RegCIdx).getReg(); LLVM_DEBUG(dbgs() << "2addr: COMMUTING : " << *MI); - MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx); + MachineInstr *NewMI = TII->commuteInstruction(*MI, PSI, MBFI, false, RegBIdx, + RegCIdx); if (NewMI == nullptr) { LLVM_DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n"); @@ -1681,6 +1688,10 @@ // fixups are necessary for correctness. if (skipFunction(Func.getFunction())) OptLevel = CodeGenOpt::None; + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; bool MadeChange = false; Index: lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.h +++ lib/Target/AArch64/AArch64InstrInfo.h @@ -163,6 +163,8 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; Index: lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.cpp +++ lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3104,6 +3104,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS, VirtRegMap *VRM) const { // This is a bit of a hack. Consider this instruction: // Index: lib/Target/AMDGPU/GCNDPPCombine.cpp =================================================================== --- lib/Target/AMDGPU/GCNDPPCombine.cpp +++ lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -458,7 +458,7 @@ auto *BB = OrigMI.getParent(); auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); BB->insert(OrigMI, NewMI); - if (TII->commuteInstruction(*NewMI)) { + if (TII->commuteInstruction(*NewMI, nullptr, nullptr)) { LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ)) { Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -269,7 +269,7 @@ MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF)); if (Fold.isCommuted()) - TII.commuteInstruction(*Inst32, false); + TII.commuteInstruction(*Inst32, nullptr, nullptr, false); return true; } @@ -370,7 +370,8 @@ return false; if (!CanCommute || - !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) + !TII->commuteInstruction(*MI, nullptr, nullptr, false, CommuteIdx0, + CommuteIdx1)) return false; if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) { @@ -399,7 +400,8 @@ return true; } - TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); + TII->commuteInstruction(*MI, nullptr, nullptr, false, CommuteIdx0, + CommuteIdx1); return false; } @@ -1101,7 +1103,7 @@ tryFoldInst(TII, Fold.UseMI); } else if (Fold.isCommuted()) { // Restoring instruction's original operand order if fold has failed. 
- TII->commuteInstruction(*Fold.UseMI, false); + TII->commuteInstruction(*Fold.UseMI, nullptr, nullptr, false); } } } Index: lib/Target/AMDGPU/SIInsertSkips.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertSkips.cpp +++ lib/Target/AMDGPU/SIInsertSkips.cpp @@ -371,7 +371,7 @@ MachineOperand &Op1 = A->getOperand(1); MachineOperand &Op2 = A->getOperand(2); if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { - TII->commuteInstruction(*A); + TII->commuteInstruction(*A, nullptr, nullptr); Changed = true; } if (Op1.getReg() != ExecReg) Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -137,7 +137,10 @@ MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1594,7 +1594,10 @@ return &MI; } -MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, +MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned Src0Idx, unsigned Src1Idx) const { assert(!NewMI && "this should never be used"); @@ -1618,7 +1621,8 @@ if (isOperandLegal(MI, Src1Idx, &Src0)) { // Be sure to copy the source modifiers to the right place. CommutedMI - = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); + = TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, + Src0Idx, Src1Idx); } } else if (Src0.isReg() && !Src1.isReg()) { @@ -2383,7 +2387,7 @@ if (Def && Def->isMoveImmediate() && isInlineConstant(Def->getOperand(1)) && MRI->hasOneUse(Src1->getReg()) && - commuteInstruction(UseMI)) { + commuteInstruction(UseMI, nullptr, nullptr)) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); } else if ((Register::isPhysicalRegister(Src1->getReg()) && RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -113,12 +113,12 @@ // We have failed to fold src0, so commute the instruction and try again. if (TryToCommute && MI.isCommutable()) { - if (TII->commuteInstruction(MI)) { + if (TII->commuteInstruction(MI, nullptr, nullptr)) { if (foldImmediates(MI, TII, MRI, false)) return true; // Commute back. - TII->commuteInstruction(MI); + TII->commuteInstruction(MI, nullptr, nullptr); } } @@ -183,7 +183,7 @@ // cmpk instructions do scc = dst imm16, so commute the instruction to // get constants on the RHS. 
if (!MI.getOperand(0).isReg()) - TII->commuteInstruction(MI, false, 0, 1); + TII->commuteInstruction(MI, nullptr, nullptr, false, 0, 1); const MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) @@ -355,7 +355,7 @@ if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && SrcImm == Src0) { - if (!TII->commuteInstruction(MI, false, 1, 2)) + if (!TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2)) NewImm = 0; } @@ -634,7 +634,7 @@ MachineOperand *Src1 = &MI.getOperand(2); if (!Src0->isReg() && Src1->isReg()) { - if (TII->commuteInstruction(MI, false, 1, 2)) + if (TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2)) std::swap(Src0, Src1); } @@ -704,7 +704,8 @@ if (!TII->canShrink(MI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. - if (!MI.isCommutable() || !TII->commuteInstruction(MI) || + if (!MI.isCommutable() || + !TII->commuteInstruction(MI, nullptr, nullptr) || !TII->canShrink(MI, MRI)) continue; } Index: lib/Target/ARM/ARMBaseInstrInfo.h =================================================================== --- lib/Target/ARM/ARMBaseInstrInfo.h +++ lib/Target/ARM/ARMBaseInstrInfo.h @@ -96,7 +96,10 @@ /// non-commutable pair of operand indices OpIdx1 and OpIdx2. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; @@ -322,7 +325,9 @@ /// VFP/NEON execution domains. std::pair getExecutionDomain(const MachineInstr &MI) const override; - void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override; + void setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const override; unsigned getPartialRegUpdateClearance(const MachineInstr &, unsigned, Index: lib/Target/ARM/ARMBaseInstrInfo.cpp =================================================================== --- lib/Target/ARM/ARMBaseInstrInfo.cpp +++ lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2110,6 +2110,8 @@ } MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { @@ -2123,7 +2125,8 @@ if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; MachineInstr *CommutedMI = - TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, + OpIdx2); if (!CommutedMI) return nullptr; // After swapping the MOVCC operands, also invert the condition. 
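Each target override of commuteInstructionImpl below changes in the same mechanical way: the new PSI/MBFI parameters are accepted and forwarded to the TargetInstrInfo base implementation. A sketch of that forwarding shape for a hypothetical target (MyTargetInstrInfo is illustrative only):

// Sketch only: the shape shared by the ARM, PowerPC, SystemZ, WebAssembly
// and X86 overrides in this patch.
MachineInstr *MyTargetInstrInfo::commuteInstructionImpl(
    MachineInstr &MI, ProfileSummaryInfo *PSI,
    const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned OpIdx1,
    unsigned OpIdx2) const {
  // Most targets have no size-aware special case and simply pass the
  // profile arguments through; of the targets here, only X86 consults
  // them (for the BLENDPD/BLENDPS -> MOVSD/MOVSS rewrite further down).
  return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI,
                                                 OpIdx1, OpIdx2);
}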
@@ -2132,7 +2135,8 @@ return CommutedMI; } } - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, + OpIdx2); } /// Identify instructions that can be folded into a MOVCC instruction, and @@ -4890,7 +4894,9 @@ } void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, - unsigned Domain) const { + unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { unsigned DstReg, SrcReg, DReg; unsigned Lane; MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); Index: lib/Target/ARM/Thumb2SizeReduction.cpp =================================================================== --- lib/Target/ARM/Thumb2SizeReduction.cpp +++ lib/Target/ARM/Thumb2SizeReduction.cpp @@ -758,7 +758,7 @@ if (Reg1 != Reg0) return false; // Try to commute the operands to make it a 2-address instruction. - MachineInstr *CommutedMI = TII->commuteInstruction(*MI); + MachineInstr *CommutedMI = TII->commuteInstruction(*MI, nullptr, nullptr); if (!CommutedMI) return false; } @@ -770,7 +770,8 @@ MI->getOperand(CommOpIdx2).getReg() != Reg0) return false; MachineInstr *CommutedMI = - TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2); + TII->commuteInstruction(*MI, nullptr, nullptr, false, + CommOpIdx1, CommOpIdx2); if (!CommutedMI) return false; } Index: lib/Target/PowerPC/PPCInstrInfo.h =================================================================== --- lib/Target/PowerPC/PPCInstrInfo.h +++ lib/Target/PowerPC/PPCInstrInfo.h @@ -171,7 +171,10 @@ /// /// For example, we can commute rlwimi instructions, but only if the /// rotate amt is zero. We also have to munge the immediates a bit. - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; Index: lib/Target/PowerPC/PPCInstrInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCInstrInfo.cpp +++ lib/Target/PowerPC/PPCInstrInfo.cpp @@ -365,14 +365,18 @@ return 0; } -MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, +MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { MachineFunction &MF = *MI.getParent()->getParent(); // Normal instructions can be commuted the obvious way. if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo) - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, + OpIdx2); // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because // changing the relative order of the mask operands might change what happens Index: lib/Target/SystemZ/SystemZInstrInfo.h =================================================================== --- lib/Target/SystemZ/SystemZInstrInfo.h +++ lib/Target/SystemZ/SystemZInstrInfo.h @@ -188,7 +188,10 @@ /// non-commutable operands. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. 
- MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override; @@ -255,11 +258,15 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const override; bool expandPostRAPseudo(MachineInstr &MBBI) const override; bool reverseBranchCondition(SmallVectorImpl &Cond) const Index: lib/Target/SystemZ/SystemZInstrInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZInstrInfo.cpp +++ lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -259,7 +259,7 @@ // If the destination (now) matches one source, prefer this to be first. if (DestReg != Src1Reg && DestReg == Src2Reg) { - commuteInstruction(MI, false, 1, 2); + commuteInstruction(MI, nullptr, nullptr, false, 1, 2); std::swap(Src1Reg, Src2Reg); std::swap(Src1IsHigh, Src2IsHigh); } @@ -361,6 +361,8 @@ } MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { @@ -384,11 +386,13 @@ unsigned CCValid = WorkingMI.getOperand(3).getImm(); unsigned CCMask = WorkingMI.getOperand(4).getImm(); WorkingMI.getOperand(4).setImm(CCMask ^ CCValid); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } default: - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, + NewMI, OpIdx1, OpIdx2); } } @@ -746,7 +750,7 @@ } if (CommuteIdx != -1) - if (!commuteInstruction(UseMI, false, CommuteIdx, UseIdx)) + if (!commuteInstruction(UseMI, nullptr, nullptr, false, CommuteIdx, UseIdx)) return false; bool DeleteDef = MRI->hasOneNonDBGUse(Reg); @@ -1088,6 +1092,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS, VirtRegMap *VRM) const { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1301,6 +1306,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS) const { return nullptr; } Index: lib/Target/SystemZ/SystemZShortenInst.cpp =================================================================== --- lib/Target/SystemZ/SystemZShortenInst.cpp +++ lib/Target/SystemZ/SystemZShortenInst.cpp @@ -185,7 +185,7 @@ return true; } if (MI.getOperand(0).getReg() == MI.getOperand(2).getReg()) { - TII->commuteInstruction(MI, false, 1, 2); + TII->commuteInstruction(MI, 
nullptr, nullptr, false, 1, 2); MI.setDesc(TII->get(Opcode)); MI.tieOperands(0, 1); return true; @@ -338,7 +338,7 @@ if ((MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) && (!MI.isCommutable() || MI.getOperand(0).getReg() != MI.getOperand(2).getReg() || - !TII->commuteInstruction(MI, false, 1, 2))) + !TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2))) break; MI.setDesc(TII->get(TwoOperandOpcode)); Index: lib/Target/WebAssembly/WebAssemblyInstrInfo.h =================================================================== --- lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -48,7 +48,10 @@ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; Index: lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp =================================================================== --- lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -85,7 +85,9 @@ } MachineInstr *WebAssemblyInstrInfo::commuteInstructionImpl( - MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { + MachineInstr &MI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { // If the operands are stackified, we can't reorder them. WebAssemblyFunctionInfo &MFI = *MI.getParent()->getParent()->getInfo(); @@ -94,7 +96,8 @@ return nullptr; // Otherwise use the default implementation. - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, + OpIdx1, OpIdx2); } // Branch analysis. Index: lib/Target/WebAssembly/WebAssemblyRegStackify.cpp =================================================================== --- lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -740,7 +740,8 @@ assert(!Declined && "Don't decline commuting until you've finished trying it"); // Commuting didn't help. Revert it. - TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1); + TII->commuteInstruction(*Insert, nullptr, nullptr, /*NewMI=*/false, + Operand0, Operand1); TentativelyCommuting = false; Declined = true; } else if (!Declined && TreeWalker.hasRemainingOperands(Insert)) { @@ -748,7 +749,8 @@ Operand1 = TargetInstrInfo::CommuteAnyOperandIndex; if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) { // Tentatively commute the operands and try again. - TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1); + TII->commuteInstruction(*Insert, nullptr, nullptr, /*NewMI=*/false, + Operand0, Operand1); TreeWalker.resetTopOperands(Insert); TentativelyCommuting = true; Declined = false; Index: lib/Target/X86/X86FastISel.cpp =================================================================== --- lib/Target/X86/X86FastISel.cpp +++ lib/Target/X86/X86FastISel.cpp @@ -179,6 +179,11 @@ bool Op0IsKill, unsigned Op1, bool Op1IsKill, unsigned Op2, bool Op2IsKill, unsigned Op3, bool Op3IsKill); + + bool shouldOptForSize(const MachineFunction *MF) const { + // TODO: Implement PGSO. + return MF->getFunction().hasOptSize(); + } }; } // end anonymous namespace. 
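X86FastISel's new shouldOptForSize helper above deliberately punts on PGSO. If FastISel were later handed PSI and MBFI the way this patch hands them to SelectionDAG, the TODO could plausibly be resolved along these lines; the PSI/MBFI members are an assumption, not part of this patch:

// Sketch only: a possible future body for the TODO above, assuming
// hypothetical PSI/MBFI fields were added to the FastISel object.
bool shouldOptForSize(const MachineFunction *MF) const {
  return MF->getFunction().hasOptSize() ||
         llvm::shouldOptimizeForSize(MF, PSI, MBFI); // assumed fields
}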
@@ -3935,7 +3940,7 @@ MachineInstr *Result = XII.foldMemoryOperandImpl( *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment, - /*AllowCommute=*/true); + /*AllowCommute=*/true, nullptr, nullptr); if (!Result) return false; Index: lib/Target/X86/X86FixupBWInsts.cpp =================================================================== --- lib/Target/X86/X86FixupBWInsts.cpp +++ lib/Target/X86/X86FixupBWInsts.cpp @@ -48,11 +48,14 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" @@ -107,6 +110,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); // Machine loop info is used to // guide some heuristics. + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -134,6 +139,9 @@ /// Register Liveness information after the current instruction. LivePhysRegs LiveRegs; + + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; }; char FixupBWInstPass::ID = 0; } @@ -148,8 +156,11 @@ this->MF = &MF; TII = MF.getSubtarget().getInstrInfo(); - OptForSize = MF.getFunction().hasOptSize(); MLI = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; LiveRegs.init(TII->getRegisterInfo()); LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";); @@ -384,6 +395,9 @@ // We run after PEI, so we need to AddPristinesAndCSRs. LiveRegs.addLiveOuts(MBB); + OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); + for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { MachineInstr *MI = &*I; Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -330,7 +330,7 @@ // Do not want to hoist if we're not optimizing for size. // TODO: We'd like to remove this restriction. // See the comment in X86InstrInfo.td for more info. - if (!OptForSize) + if (!CurDAG->shouldOptForSize()) return false; // Walk all the users of the immediate. @@ -2999,7 +2999,7 @@ LLVM_FALLTHROUGH; case X86ISD::ADD: // Try to match inc/dec. - if (!Subtarget->slowIncDec() || OptForSize) { + if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. 
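On the SelectionDAG side the recurring change is a one-liner: function-wide hasOptSize()/OptForSize checks become calls to the new block-aware SelectionDAG::shouldOptForSize(). A caller-side sketch (lowerWithSizeBias is a hypothetical helper, not part of the patch):

// Sketch only: how DAG lowering code picks between expansions using the
// new predicate rather than the per-function attribute.
static SDValue lowerWithSizeBias(SelectionDAG &DAG, SDValue Fast,
                                 SDValue Small) {
  // shouldOptForSize() is true under -Os/-Oz, and also when PGSO decides
  // the basic block currently being selected is not profile-hot.
  return DAG.shouldOptForSize() ? Small : Fast;
}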
Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -8172,7 +8174,7 @@ // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -11067,7 +11069,7 @@ case MVT::v32i16: case MVT::v64i8: { // Attempt to lower to a bitmask if we can. Only if not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize) { if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -18138,7 +18140,7 @@ "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -19442,7 +19444,7 @@ /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool IsOptimizingSize = DAG.shouldOptForSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -20128,7 +20130,7 @@ } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -39506,7 +39508,7 @@ return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -24,7 +24,9 @@ #include "X86GenInstrInfo.inc" namespace llvm { +class MachineBlockFrequencyInfo; class MachineInstrBuilder; +class ProfileSummaryInfo; class X86RegisterInfo; class X86Subtarget; @@ -338,6 +340,8 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; @@ -347,6 +351,7 @@ MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const override; /// unfoldMemoryOperand - Separate a single instruction which folded a load or @@ -424,9 +429,13 @@ uint16_t getExecutionDomainCustom(const MachineInstr &MI) const; - void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override; - - bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const; + void setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const override; + + bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const; unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, @@ -441,7 +450,9 @@ ArrayRef MOs, MachineBasicBlock::iterator InsertPt, unsigned Size, unsigned Alignment, - bool AllowCommute) const; + bool AllowCommute, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const; bool isHighLatencyDef(int opc) const override; @@ -487,7 +498,9 @@ MachineInstr *optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const override; + MachineInstr *&DefMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const override; std::pair decomposeMachineOperandsTargetFlags(unsigned TF) const override; @@ -534,7 +547,10 @@ /// non-commutable operands. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. 
- MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override; Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -19,14 +19,17 @@ #include "X86TargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -1495,7 +1498,10 @@ #undef VPERM_CASES } -MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & { @@ -1526,7 +1532,8 @@ auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); WorkingMI.getOperand(3).setImm(Size - Amt); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::PFSUBrr: @@ -1537,15 +1544,20 @@ (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: case X86::VBLENDPDrri: - case X86::VBLENDPSrri: + case X86::VBLENDPSrri: { // If we're optimizing for size, try to use MOVSD/MOVSS. 
-    if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
+    auto *MBB = MI.getParent();
+    auto *MF = MBB->getParent();
+    bool OptForSize = MF->getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+    if (OptForSize) {
       unsigned Mask, Opc;
       switch (MI.getOpcode()) {
       default: llvm_unreachable("Unreachable!");
@@ -1559,11 +1571,13 @@
       WorkingMI.setDesc(get(Opc));
       WorkingMI.RemoveOperand(3);
       return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
+                                                     PSI, MBFI,
                                                      /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }
   }
   LLVM_FALLTHROUGH;
+  }
   case X86::PBLENDWrri:
   case X86::VBLENDPDYrri:
   case X86::VBLENDPSYrri:
@@ -1592,7 +1606,8 @@
     int8_t Imm = MI.getOperand(3).getImm() & Mask;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(Mask ^ Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::INSERTPSrr:
@@ -1612,7 +1627,8 @@
     unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   return nullptr;
@@ -1635,7 +1651,8 @@
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
     WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
@@ -1646,7 +1663,8 @@
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(X86::SHUFPDrri));
     WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::SHUFPDrri: {
@@ -1655,7 +1673,8 @@
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(X86::MOVSDrr));
     WorkingMI.RemoveOperand(3);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::PCLMULQDQrr:
@@ -1671,7 +1690,8 @@
     unsigned Src2Hi = Imm & 0x10;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
@@ -1703,7 +1723,8 @@
     Imm = X86::getSwappedVPCMPImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPCOMBri: case X86::VPCOMUBri:
@@ -1715,7 +1736,8 @@
     Imm = X86::getSwappedVPCOMImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPERM2F128rr:
@@ -1726,7 +1748,8 @@
     int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::MOVHLPSrr:
@@ -1749,7 +1772,8 @@
     }
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
@@ -1757,7 +1781,8 @@
     unsigned OpNo = MI.getDesc().getNumOperands() - 1;
     X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
     WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
@@ -1792,7 +1817,8 @@
   case X86::VPTERNLOGQZrmbikz: {
     auto &WorkingMI = cloneIfNew(MI);
     commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   default: {
@@ -1800,7 +1826,8 @@
       unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.setDesc(get(Opc));
-      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                     /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }
@@ -1811,11 +1838,13 @@
           getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.setDesc(get(Opc));
-      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                     /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }

-    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI,
+                                                   NewMI, OpIdx1, OpIdx2);
   }
   }
 }
@@ -3758,7 +3787,9 @@
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
                                               const MachineRegisterInfo *MRI,
                                               unsigned &FoldAsLoadDefReg,
-                                              MachineInstr *&DefMI) const {
+                                              MachineInstr *&DefMI,
+                                              ProfileSummaryInfo *PSI,
+                                              const MachineBlockFrequencyInfo *MBFI) const {
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
@@ -3784,7 +3815,8 @@
     return nullptr;

   // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI,
+                                               PSI, MBFI)) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
   }
@@ -4736,7 +4768,8 @@
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
-    unsigned Size, unsigned Align, bool AllowCommute) const {
+    unsigned Size, unsigned Align, bool AllowCommute,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI) const {
   bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
   bool isTwoAddrFold = false;
@@ -4869,7 +4902,7 @@
         return nullptr;

       MachineInstr *CommutedMI =
-          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+          commuteInstruction(MI, PSI, MBFI, false, CommuteOpIdx1, CommuteOpIdx2);
       if (!CommutedMI) {
         // Unable to commute.
         return nullptr;
       }
@@ -4882,13 +4915,14 @@
       // Attempt to fold with the commuted version of the instruction.
       NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
-                                    Size, Align, /*AllowCommute=*/false);
+                                    Size, Align, /*AllowCommute=*/false,
+                                    PSI, MBFI);
       if (NewMI)
         return NewMI;

       // Folding failed again - undo the commute before returning.
       MachineInstr *UncommutedMI =
-          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+          commuteInstruction(MI, PSI, MBFI, false, CommuteOpIdx1, CommuteOpIdx2);
       if (!UncommutedMI) {
         // Unable to commute.
         return nullptr;
       }
@@ -4914,7 +4948,10 @@
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                                     ArrayRef<unsigned> Ops,
                                     MachineBasicBlock::iterator InsertPt,
-                                    int FrameIndex, LiveIntervals *LIS,
+                                    int FrameIndex,
+                                    ProfileSummaryInfo *PSI,
+                                    const MachineBlockFrequencyInfo *MBFI,
+                                    LiveIntervals *LIS,
                                     VirtRegMap *VRM) const {
   // Check switch flag
   if (NoFusing)
@@ -4964,7 +5001,8 @@
   return foldMemoryOperandImpl(MF, MI, Ops[0],
                                MachineOperand::CreateFI(FrameIndex), InsertPt,
-                               Size, Alignment, /*AllowCommute=*/true);
+                               Size, Alignment, /*AllowCommute=*/true,
+                               PSI, MBFI);
 }

 /// Check if \p LoadMI is a partial register load that we can't fold into \p MI
@@ -5105,6 +5143,7 @@
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
     LiveIntervals *LIS) const {

   // TODO: Support the case where LoadMI loads a wide register, but MI
@@ -5120,7 +5159,8 @@
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
     if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
       return nullptr;
-    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
+    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, PSI, MBFI,
+                                 LIS);
   }

   // Check switch flag
@@ -5266,7 +5306,8 @@
     }
   }
   return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
-                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true,
+                               PSI, MBFI);
 }

 static SmallVector<MachineMemOperand *, 2>
@@ -6593,7 +6634,9 @@
 }

 bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
-                                            unsigned Domain) const {
+                                            unsigned Domain,
+                                            ProfileSummaryInfo *PSI,
+                                            const MachineBlockFrequencyInfo *MBFI) const {
   assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
@@ -6702,7 +6745,7 @@
         MI.getOperand(0).getSubReg() == 0 &&
         MI.getOperand(1).getSubReg() == 0 &&
         MI.getOperand(2).getSubReg() == 0) {
-      commuteInstruction(MI, false);
+      commuteInstruction(MI, PSI, MBFI, false);
       return true;
     }
     // We must always return true for MOVHLPSrr.
@@ -6765,13 +6808,15 @@
   return std::make_pair(domain, validDomains);
 }

-void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain,
+                                      ProfileSummaryInfo *PSI,
+                                      const MachineBlockFrequencyInfo *MBFI) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");

   // Attempt to match for custom instructions.
-  if (setExecutionDomainCustom(MI, Domain))
+  if (setExecutionDomainCustom(MI, Domain, PSI, MBFI))
     return;

   const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
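Note: the PSI/MBFI pair threaded through the hooks above exists to feed the MachineIR-level PGSO queries from llvm/CodeGen/MachineSizeOpts.h, included near the top of this file. Judging from the call sites in this patch, the interface is presumably along these lines (a sketch, not quoted from the header):

  // Returns true if the function/block should be optimized for size because
  // profile data classifies it as cold (or, under a huge working set, not hot).
  bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI,
                             const MachineBlockFrequencyInfo *MBFI);
  bool shouldOptimizeForSize(const MachineBasicBlock *MBB,
                             ProfileSummaryInfo *PSI,
                             const MachineBlockFrequencyInfo *MBFI);

Passing a null MBFI presumably just disables the profile-guided path, which is why callers below compute it lazily and only when a profile summary is present.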
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -986,12 +986,12 @@
 // the Function object through the Subtarget and objections were raised
 // to that (see post-commit review comments for r301750).
 let RecomputePerFunction = 1 in {
-  def OptForSize : Predicate<"MF->getFunction().hasOptSize()">;
+  def OptForSize : Predicate<"shouldOptForSize(MF)">;
   def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
-  def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">;
+  def OptForSpeed : Predicate<"!shouldOptForSize(MF)">;
   def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
-                            "MF->getFunction().hasOptSize()">;
-  def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || "
+                            "shouldOptForSize(MF)">;
+  def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
                              "!Subtarget->hasSSE41()">;
 }
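Note: the rewritten predicates call a free-standing shouldOptForSize(MF) helper rather than querying the Function directly, so the PGSO decision can change without touching the .td files. The GlobalISelEmitter change at the end of this patch emits a conservative placeholder of exactly this shape for X86 (attribute check only, PGSO left as a TODO). A PGSO-aware variant for the SelectionDAG path would presumably look like the following sketch, with PSI/MBFI supplied by the enclosing selection pass; this is not the definition used by the generated matcher:

  static bool shouldOptForSize(const MachineFunction *MF,
                               ProfileSummaryInfo *PSI,
                               const MachineBlockFrequencyInfo *MBFI) {
    return MF->getFunction().hasOptSize() ||
           llvm::shouldOptimizeForSize(MF, PSI, MBFI);
  }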
Index: lib/Target/X86/X86OptimizeLEAs.cpp
===================================================================
--- lib/Target/X86/X86OptimizeLEAs.cpp
+++ lib/Target/X86/X86OptimizeLEAs.cpp
@@ -25,6 +25,8 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,6 +34,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -247,6 +250,12 @@
   static char ID;

+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
@@ -681,6 +690,11 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;

   // Process all basic blocks.
   for (auto &MBB : MF) {
@@ -699,7 +713,9 @@
     // Remove redundant address calculations. Do it only for -Os/-Oz since only
     // a code size gain is expected from this part of the pass.
-    if (MF.getFunction().hasOptSize())
+    bool OptForSize = MF.getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+    if (OptForSize)
       Changed |= removeRedundantAddrCalc(LEAs);
   }

Index: lib/Target/X86/X86PadShortFunction.cpp
===================================================================
--- lib/Target/X86/X86PadShortFunction.cpp
+++ lib/Target/X86/X86PadShortFunction.cpp
@@ -17,8 +17,11 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Function.h"
@@ -52,6 +55,12 @@
     bool runOnMachineFunction(MachineFunction &MF) override;

+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
+      AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
     MachineFunctionProperties getRequiredProperties() const override {
       return MachineFunctionProperties().set(
           MachineFunctionProperties::Property::NoVRegs);
@@ -105,6 +114,12 @@
   TSM.init(&MF.getSubtarget());

+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
+
   // Search through basic blocks and mark the ones that have early returns
   ReturnBBs.clear();
   VisitedBBs.clear();
@@ -118,6 +133,11 @@
     MachineBasicBlock *MBB = I->first;
     unsigned Cycles = I->second;

+    // Function::hasOptSize is already checked above.
+    bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+    if (OptForSize)
+      continue;
+
     if (Cycles < Threshold) {
       // BB ends in a return. Skip over any DBG_VALUE instructions
       // trailing the terminator.

Index: lib/Transforms/Utils/SizeOpts.cpp
===================================================================
--- lib/Transforms/Utils/SizeOpts.cpp
+++ lib/Transforms/Utils/SizeOpts.cpp
@@ -14,24 +14,48 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
+
 using namespace llvm;

-static cl::opt<bool> ProfileGuidedSizeOpt(
+cl::opt<bool> EnablePGSO(
     "pgso", cl::Hidden, cl::init(true),
-    cl::desc("Enable the profile guided size optimization. "));
+    cl::desc("Enable the profile guided size optimizations. "));
+
+cl::opt<bool> PGSOHugeWorkingSetSizeOnly(
+    "pgso-hwss-only", cl::Hidden, cl::init(true),
+    cl::desc("Apply the profile guided size optimizations only "
+             "if the working set size is huge (except for cold code.)"));
+
+cl::opt<bool> ForcePGSO(
+    "force-pgso", cl::Hidden, cl::init(false),
+    cl::desc("Force the (profile-guided) size optimizations. "));

-bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
+bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI) {
   assert(F);
   if (!PSI || !BFI || !PSI->hasProfileSummary())
     return false;
-  return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI);
+  if (ForcePGSO)
+    return true;
+  if (!EnablePGSO)
+    return false;
+  if (PGSOHugeWorkingSetSizeOnly && !PSI->pgsoHasHugeWorkingSetSize())
+    // Even if the working set size isn't huge, size-optimize cold code.
+    return PSI->isFunctionColdInCallGraph(F, *BFI);
+  return !PSI->isFunctionPgsoHotInCallGraph(F, *BFI);
 }

-bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
+bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI) {
   assert(BB);
   if (!PSI || !BFI || !PSI->hasProfileSummary())
     return false;
-  return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI);
+  if (ForcePGSO)
+    return true;
+  if (!EnablePGSO)
+    return false;
+  if (PGSOHugeWorkingSetSizeOnly && !PSI->pgsoHasHugeWorkingSetSize())
+    // Even if the working set size isn't huge, size-optimize cold code.
+    return PSI->isColdBlock(BB, BFI);
+  return !PSI->isPgsoHotBlock(BB, BFI);
 }
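For illustration, a typical IR-level consumer of the queries above would look like the following; the two emit* helpers are hypothetical and stand in for any size-versus-speed lowering choice:

  // PSI comes from ProfileSummaryInfoWrapperPass (legacy PM) or
  // ProfileSummaryAnalysis (new PM); BFI must be valid whenever a profile
  // summary is present, otherwise the query conservatively returns false.
  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&BB, PSI, BFI))
    emitSizeOptimizedLowering();   // hypothetical
  else
    emitSpeedOptimizedLowering();  // hypothetical

Note the asymmetry the implementation encodes: under the default pgso-hwss-only mode, a profile with a non-huge working set only size-optimizes provably cold code, while a huge working set flips the default and size-optimizes everything that is not PGSO-hot.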
Index: test/CodeGen/AArch64/O0-pipeline.ll
===================================================================
--- test/CodeGen/AArch64/O0-pipeline.ll
+++ test/CodeGen/AArch64/O0-pipeline.ll
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
@@ -44,13 +45,17 @@
 ; CHECK-NEXT: Analysis for ComputingKnownBits
 ; CHECK-NEXT: InstructionSelect
 ; CHECK-NEXT: ResetMachineFunction
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: AArch64 Instruction Selection
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT: Local Stack Slot Allocation
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Fast Register Allocator
-; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
Index: test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- test/CodeGen/AArch64/O3-pipeline.ll
+++ test/CodeGen/AArch64/O3-pipeline.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Profile summary info
+; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
@@ -35,6 +35,9 @@
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Expand memcmp() to load/stores
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
@@ -76,10 +79,13 @@
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: AArch64 Instruction Selection
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
 ; CHECK-NEXT: Slot index numbering
@@ -91,6 +97,7 @@
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Machine Trace Metrics
 ; CHECK-NEXT: AArch64 Conditional Compares
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine InstCombiner
 ; CHECK-NEXT: AArch64 Conditional Branch Tuning
 ; CHECK-NEXT: Machine Trace Metrics
@@ -105,6 +112,7 @@
 ; CHECK-NEXT: Machine Common Subexpression Elimination
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Machine code sinking
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Peephole Optimizations
 ; CHECK-NEXT: Remove dead machine instructions
 ; CHECK-NEXT: AArch64 Dead register definitions
@@ -113,6 +121,7 @@
 ; CHECK-NEXT: Remove unreachable machine basic blocks
 ; CHECK-NEXT: Live Variable Analysis
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Slot index numbering
 ; CHECK-NEXT: Live Interval Analysis
@@ -126,7 +135,6 @@
 ; CHECK-NEXT: Live Register Matrix
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: Spill Code Placement Analysis
-; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Greedy Register Allocator
 ; CHECK-NEXT: Virtual Register Rewriter
@@ -145,6 +153,7 @@
 ; CHECK-NEXT: Shrink Wrapping analysis
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Control Flow Optimizer
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Tail Duplication
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
Index: test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
===================================================================
--- test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
+++ test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
@@ -18,7 +18,6 @@
 ; GreedyRegAlloc, please adjust accordingly.)
 ; HOTNESS: Executing Pass 'Spill Code Placement Analysis'
-; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis'
 ; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter'
 ; HOTNESS-NEXT: MachineBlockFrequencyInfo is available
 ; HOTNESS-NEXT: Executing Pass 'Greedy Register Allocator'
Index: test/CodeGen/ARM/O3-pipeline.ll
===================================================================
--- test/CodeGen/ARM/O3-pipeline.ll
+++ test/CodeGen/ARM/O3-pipeline.ll
@@ -19,6 +19,9 @@
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Expand memcmp() to load/stores
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
@@ -60,8 +63,11 @@
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: ARM Instruction Selection
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
 ; CHECK-NEXT: Slot index numbering
@@ -75,6 +81,7 @@
 ; CHECK-NEXT: Machine Common Subexpression Elimination
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Machine code sinking
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Peephole Optimizations
 ; CHECK-NEXT: Remove dead machine instructions
 ; CHECK-NEXT: ARM MLA / MLS expansion pass
@@ -87,6 +94,7 @@
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Slot index numbering
 ; CHECK-NEXT: Live Interval Analysis
@@ -100,7 +108,6 @@
 ; CHECK-NEXT: Live Register Matrix
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: Spill Code Placement Analysis
-; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Greedy Register Allocator
 ; CHECK-NEXT: Virtual Register Rewriter
@@ -115,11 +122,13 @@
 ; CHECK-NEXT: Shrink Wrapping analysis
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Control Flow Optimizer
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Tail Duplication
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
 ; CHECK-NEXT: ARM load / store optimization pass
 ; CHECK-NEXT: ReachingDefAnalysis
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: ARM Execution Domain Fix
 ; CHECK-NEXT: BreakFalseDeps
 ; CHECK-NEXT: ARM pseudo instruction expansion pass
Index: test/CodeGen/X86/O0-pipeline.ll
===================================================================
--- test/CodeGen/X86/O0-pipeline.ll
+++ test/CodeGen/X86/O0-pipeline.ll
@@ -14,6 +14,7 @@
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
@@ -36,6 +37,10 @@
 ; CHECK-NEXT: Safe Stack instrumentation pass
 ; CHECK-NEXT: Insert stack protectors
 ; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: X86 DAG->DAG Instruction Selection
 ; CHECK-NEXT: X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
@@ -45,11 +50,11 @@
 ; CHECK-NEXT: X86 EFLAGS copy lowering
 ; CHECK-NEXT: X86 WinAlloca Expander
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Fast Register Allocator
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: X86 FP Stackifier
-; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
Index: test/CodeGen/X86/O3-pipeline.ll
===================================================================
--- test/CodeGen/X86/O3-pipeline.ll
+++ test/CodeGen/X86/O3-pipeline.ll
@@ -13,8 +13,8 @@
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Profile summary info
+; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
@@ -32,6 +32,9 @@
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Expand memcmp() to load/stores
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
@@ -63,12 +66,15 @@
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: X86 DAG->DAG Instruction Selection
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT: X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT: X86 Domain Reassignment Pass
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
 ; CHECK-NEXT: Slot index numbering
@@ -79,6 +85,7 @@
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Machine Trace Metrics
 ; CHECK-NEXT: Early If-Conversion
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine InstCombiner
 ; CHECK-NEXT: X86 cmov Conversion
 ; CHECK-NEXT: MachineDominator Tree Construction
@@ -88,10 +95,12 @@
 ; CHECK-NEXT: Machine Common Subexpression Elimination
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Machine code sinking
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Peephole Optimizations
 ; CHECK-NEXT: Remove dead machine instructions
 ; CHECK-NEXT: Live Range Shrink
 ; CHECK-NEXT: X86 Fixup SetCC
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 LEA Optimize
 ; CHECK-NEXT: X86 Optimize Call Frame
 ; CHECK-NEXT: X86 Avoid Store Forwarding Block
@@ -106,6 +115,7 @@
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Slot index numbering
 ; CHECK-NEXT: Live Interval Analysis
@@ -119,7 +129,6 @@
 ; CHECK-NEXT: Live Register Matrix
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: Spill Code Placement Analysis
-; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Greedy Register Allocator
 ; CHECK-NEXT: Virtual Register Rewriter
@@ -136,6 +145,7 @@
 ; CHECK-NEXT: Shrink Wrapping analysis
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Control Flow Optimizer
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Tail Duplication
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
@@ -148,13 +158,16 @@
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Branch Probability Basic Block Placement
 ; CHECK-NEXT: ReachingDefAnalysis
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Execution Dependency Fix
 ; CHECK-NEXT: BreakFalseDeps
 ; CHECK-NEXT: X86 Indirect Branch Tracking
 ; CHECK-NEXT: X86 vzeroupper inserter
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Byte/Word Instruction Fixup
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Atom pad short functions
 ; CHECK-NEXT: X86 LEA Fixup
 ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
Index: unittests/CodeGen/AArch64SelectionDAGTest.cpp
===================================================================
--- unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -66,7 +66,7 @@
     if (!DAG)
       report_fatal_error("DAG?");
     OptimizationRemarkEmitter ORE(F);
-    DAG->init(*MF, ORE, nullptr, nullptr, nullptr);
+    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr);
   }

   LLVMContext Context;
Index: utils/TableGen/GlobalISelEmitter.cpp
===================================================================
--- utils/TableGen/GlobalISelEmitter.cpp
+++ utils/TableGen/GlobalISelEmitter.cpp
@@ -4870,6 +4870,14 @@
   SubtargetFeatureInfo::emitComputeAvailableFeatures(
       Target.getName(), "InstructionSelector", "computeAvailableModuleFeatures",
       ModuleFeatures, OS);
+
+  if (Target.getName() == "X86") {
+    // TODO: Implement PGSO.
+    OS << "static bool shouldOptForSize(const MachineFunction *MF) {\n";
+    OS << "  return MF->getFunction().hasOptSize();\n";
+    OS << "}\n\n";
+  }
+
   SubtargetFeatureInfo::emitComputeAvailableFeatures(
       Target.getName(), "InstructionSelector",
       "computeAvailableFunctionFeatures", FunctionFeatures, OS,