diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -40,12 +40,14 @@ } class AssumptionCache; +class BlockFrequencyInfo; class BranchInst; class Function; class GlobalValue; class IntrinsicInst; class LoadInst; class Loop; +class ProfileSummaryInfo; class SCEV; class ScalarEvolution; class StoreInst; @@ -297,7 +299,9 @@ /// \p JTSize Set a jump table size only when \p SI is suitable for a jump /// table. unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) const; + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) const; /// Estimate the cost of a given IR user when lowered. /// @@ -1177,7 +1181,9 @@ const User *U) = 0; virtual int getMemcpyCost(const Instruction *I) = 0; virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) = 0; + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) = 0; virtual int getUserCost(const User *U, ArrayRef Operands) = 0; virtual bool hasBranchDivergence() = 0; @@ -1678,8 +1684,10 @@ return Impl.getMaxInterleaveFactor(VF); } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) override { - return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize); + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) override { + return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI); } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -114,7 +114,11 @@ } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) { + unsigned &JTSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + (void)PSI; + (void)BFI; JTSize = 0; return SI.getNumCases(); } diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -48,6 +48,7 @@ class GlobalValue; class GlobalVariable; class MachineBasicBlock; +class MachineBlockFrequencyInfo; class MachineConstantPoolValue; class MachineDominatorTree; class MachineFunction; @@ -69,6 +70,7 @@ class MCTargetOptions; class MDNode; class Module; +class ProfileSummaryInfo; class raw_ostream; class RemarkStreamer; class StackMaps; @@ -108,6 +110,10 @@ /// Optimization remark emitter. MachineOptimizationRemarkEmitter *ORE; + MachineBlockFrequencyInfo *MBFI; + + ProfileSummaryInfo *PSI; + /// The symbol for the current function. This is recalculated at the beginning /// of each call to runOnMachineFunction(). MCSymbol *CurrentFnSym = nullptr; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -326,7 +326,9 @@ } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JumpTableSize) { + unsigned &JumpTableSize, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { /// Try to find the estimated number of clusters. 
Note that the number of /// clusters identified in this function could be different from the actual /// numbers found in lowering. This function ignore switches that are @@ -374,7 +376,7 @@ (MaxCaseVal - MinCaseVal) .getLimitedValue(std::numeric_limits::max() - 1) + 1; // Check whether a range of clusters is dense enough for a jump table - if (TLI->isSuitableForJumpTable(&SI, N, Range)) { + if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { JumpTableSize = Range; return 1; } diff --git a/llvm/include/llvm/CodeGen/ExecutionDomainFix.h b/llvm/include/llvm/CodeGen/ExecutionDomainFix.h --- a/llvm/include/llvm/CodeGen/ExecutionDomainFix.h +++ b/llvm/include/llvm/CodeGen/ExecutionDomainFix.h @@ -23,6 +23,8 @@ #define LLVM_CODEGEN_EXECUTIONDOMAINFIX_H #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LoopTraversal.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" @@ -126,6 +128,9 @@ ReachingDefAnalysis *RDA; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; + public: ExecutionDomainFix(char &PassID, const TargetRegisterClass &RC) : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {} @@ -133,6 +138,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h --- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -34,6 +34,7 @@ namespace llvm { class LiveIntervals; +class ProfileSummaryInfo; class MachineBlockFrequencyInfo; class MachineInstr; class MachineLoopInfo; @@ -103,14 +104,17 @@ /// foldAsLoad - If LI has a single use and a single def that can be folded as /// a load, eliminate the register by folding the def into the use. - bool foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead); + bool foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI); using ToShrinkSet = SetVector, SmallPtrSet>; /// Helper for eliminateDeadDefs. void eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, - AliasAnalysis *AA); + AliasAnalysis *AA, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI); /// MachineRegisterInfo callback to notify when new virtual /// registers are created. @@ -243,6 +247,8 @@ /// allocator. These registers should not be split into new intervals /// as currently those new intervals are not guaranteed to spill. void eliminateDeadDefs(SmallVectorImpl &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, ArrayRef RegsBeingSpilled = None, AliasAnalysis *AA = nullptr); diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -58,6 +58,7 @@ class AAResults; class BlockAddress; +class BlockFrequencyInfo; class Constant; class ConstantFP; class ConstantInt; @@ -71,6 +72,7 @@ class MachineConstantPoolValue; class MCSymbol; class OptimizationRemarkEmitter; +class ProfileSummaryInfo; class SDDbgValue; class SDDbgLabel; class SelectionDAG; @@ -235,6 +237,9 @@ /// whenever manipulating the DAG. OptimizationRemarkEmitter *ORE; + ProfileSummaryInfo *PSI; + BlockFrequencyInfo *BFI; + /// The starting token. 
SDNode EntryNode; @@ -401,7 +406,8 @@ /// Prepare this SelectionDAG to process code in the given MachineFunction. void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - LegacyDivergenceAnalysis * Divergence); + LegacyDivergenceAnalysis * Divergence, + ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin); void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) { FLI = FuncInfo; @@ -423,6 +429,8 @@ const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; } LLVMContext *getContext() const {return Context; } OptimizationRemarkEmitter &getORE() const { return *ORE; } + ProfileSummaryInfo *getPSI() const { return PSI; } + BlockFrequencyInfo *getBFI() const { return BFI; } /// Pop up a GraphViz/gv window with the DAG rendered using 'dot'. void viewGraph(const std::string &Title); @@ -1711,6 +1719,8 @@ return It->second.HeapAllocSite; } + bool shouldOptForSize() const; + private: void InsertNode(SDNode *N); bool RemoveNodeFromCSEMaps(SDNode *N); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -39,6 +39,8 @@ class GCFunctionInfo; class ScheduleDAGSDNodes; class LoadInst; +class ProfileSummaryInfo; +class BlockFrequencyInfo; /// SelectionDAGISel - This is the common base class used for SelectionDAG-based /// pattern-matching instruction selectors. @@ -249,6 +251,11 @@ virtual StringRef getIncludePathForIndex(unsigned index) { llvm_unreachable("Tblgen should generate the implementation of this!"); } + + bool shouldOptForSize(const MachineFunction *MF) const { + return CurDAG->shouldOptForSize(); + } + public: // Calls to these predicates are generated by tblgen. bool CheckAndMask(SDValue LHS, ConstantSDNode *RHS, diff --git a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h --- a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h +++ b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h @@ -19,6 +19,7 @@ class FunctionLoweringInfo; class MachineBasicBlock; +class BlockFrequencyInfo; namespace SwitchCG { @@ -264,7 +265,8 @@ std::vector BitTestCases; void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI, - MachineBasicBlock *DefaultMBB); + MachineBasicBlock *DefaultMBB, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); bool buildJumpTable(const CaseClusterVector &Clusters, unsigned First, unsigned Last, const SwitchInst *SI, @@ -295,4 +297,3 @@ } // namespace llvm #endif // LLVM_CODEGEN_SWITCHLOWERINGUTILS_H - diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h --- a/llvm/include/llvm/CodeGen/TailDuplicator.h +++ b/llvm/include/llvm/CodeGen/TailDuplicator.h @@ -25,11 +25,13 @@ namespace llvm { class MachineBasicBlock; +class MachineBlockFrequencyInfo; class MachineBranchProbabilityInfo; class MachineFunction; class MachineInstr; class MachineModuleInfo; class MachineRegisterInfo; +class ProfileSummaryInfo; class TargetRegisterInfo; /// Utility class to perform tail duplication. @@ -40,6 +42,8 @@ const MachineModuleInfo *MMI; MachineRegisterInfo *MRI; MachineFunction *MF; + const MachineBlockFrequencyInfo *MBFI; + ProfileSummaryInfo *PSI; bool PreRegAlloc; bool LayoutMode; unsigned TailDupSize; @@ -65,6 +69,8 @@ /// default implies using the command line value TailDupSize. 
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, + const MachineBlockFrequencyInfo *MBFI, + ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize = 0); bool tailDuplicateBlocks(); diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -44,12 +44,14 @@ class LiveIntervals; class LiveVariables; class MachineLoop; +class MachineBlockFrequencyInfo; class MachineMemOperand; class MachineRegisterInfo; class MCAsmInfo; class MCInst; struct MCSchedModel; class Module; +class ProfileSummaryInfo; class ScheduleDAG; class ScheduleHazardRecognizer; class SDNode; @@ -131,7 +133,10 @@ /// Do not call this method for a non-commutable instruction. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. - virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const; @@ -402,7 +407,9 @@ /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. MachineInstr * - commuteInstruction(MachineInstr &MI, bool NewMI = false, + commuteInstruction(MachineInstr &MI, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI = false, unsigned OpIdx1 = CommuteAnyOperandIndex, unsigned OpIdx2 = CommuteAnyOperandIndex) const; @@ -1001,6 +1008,8 @@ /// decide on using an opcode (note that those assignments can still change). MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef Ops, int FI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const; @@ -1008,6 +1017,8 @@ /// store from / to any address, not just from a specific stack slot. MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef Ops, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const; /// Return true when there is potentially a faster code sequence @@ -1092,6 +1103,8 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const { return nullptr; @@ -1105,6 +1118,7 @@ virtual MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const { return nullptr; } @@ -1391,7 +1405,9 @@ virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const { + MachineInstr *&DefMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { return nullptr; } @@ -1513,7 +1529,9 @@ /// /// The bit (1 << Domain) must be set in the mask returned from /// getExecutionDomain(MI). 
- virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain) const {} + virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const {} /// Returns the preferred minimum clearance /// before an instruction with an unwanted partial register update. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -28,6 +28,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -53,6 +54,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include #include #include @@ -1030,13 +1032,16 @@ /// Return true if lowering to a jump table is suitable for a set of case /// clusters which may contain \p NumCases cases, \p Range range of values. virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, - uint64_t Range) const { + uint64_t Range, ProfileSummaryInfo* PSI, + BlockFrequencyInfo *BFI) const { // FIXME: This function check the maximum table size and density, but the // minimum size is not checked. It would be nice if the minimum size is // also combined within this function. Currently, the minimum size check is // performed in findJumpTable() in SelectionDAGBuiler and // getEstimatedNumberOfCaseClusters() in BasicTTIImpl. - const bool OptForSize = SI->getParent()->getParent()->hasOptSize(); + const bool OptForSize = SI->getParent()->getParent()->hasOptSize() || + llvm::shouldOptimizeForSize(SI->getParent(), PSI, + BFI); const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize); const unsigned MaxJumpTableSize = getMaximumJumpTableSize(); diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -1456,8 +1456,9 @@ int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; unsigned JumpTableSize = 0; + BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; unsigned NumCaseCluster = - TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); + TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); // If suitable for a jump table, consider the cost for the table size and // branch to destination. 
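The hunks above all hinge on the same predicate, so here is a minimal sketch of it for readers following the review. It assumes only what the patch itself uses from llvm/Transforms/Utils/SizeOpts.h: shouldOptimizeForSize(BB, PSI, BFI) is expected to return false when no profile data is available, so non-PGO compilations keep the old hasOptSize()-only behaviour. preferSizeForBlock is a hypothetical helper name, not something this patch adds.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

// Block-sensitive replacement for the plain F->hasOptSize() test: keep the
// -Os/-Oz behaviour, and additionally treat profile-cold blocks as
// size-optimized even inside hot, non-optsize functions.
static bool preferSizeForBlock(const llvm::BasicBlock *BB,
                               llvm::ProfileSummaryInfo *PSI,
                               llvm::BlockFrequencyInfo *BFI) {
  const llvm::Function *F = BB->getParent();
  return F->hasOptSize() || llvm::shouldOptimizeForSize(BB, PSI, BFI);
}

This is exactly the shape of the OptForSize computations that isSuitableForJumpTable() above and the later hunks introduce; only the analysis plumbing differs from pass to pass.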
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -194,9 +194,10 @@ } unsigned -TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI, - unsigned &JTSize) const { - return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize); +TargetTransformInfo::getEstimatedNumberOfCaseClusters( + const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) const { + return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI); } int TargetTransformInfo::getUserCost(const User *U, diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -31,13 +31,16 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -52,6 +55,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -248,6 +252,8 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); } bool AsmPrinter::doInitialization(Module &M) { @@ -1686,6 +1692,10 @@ } ORE = &getAnalysis().getORE(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; } namespace { @@ -2904,8 +2914,10 @@ void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB, MCCodePaddingContext &Context) const { assert(MF != nullptr && "Machine function must be valid"); + bool OptForSize = MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); Context.IsPaddingActive = !MF->hasInlineAsm() && - !MF->getFunction().hasOptSize() && + !OptForSize && TM.getOptLevel() != CodeGenOpt::None; Context.IsBasicBlockReachableViaFallthrough = std::find(MBB.pred_begin(), MBB.pred_end(), MBB.getPrevNode()) != diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h --- a/llvm/lib/CodeGen/BranchFolding.h +++ b/llvm/lib/CodeGen/BranchFolding.h @@ -27,6 +27,7 @@ class MachineLoopInfo; class MachineModuleInfo; class MachineRegisterInfo; +class ProfileSummaryInfo; class raw_ostream; class TargetInstrInfo; class TargetRegisterInfo; @@ -39,6 +40,7 @@ bool CommonHoist, MBFIWrapper &FreqInfo, const MachineBranchProbabilityInfo &ProbInfo, + ProfileSummaryInfo *PSI, // Min tail length to merge. Defaults to commandline // flag. Ignored for optsize. 
unsigned MinTailLength = 0); @@ -145,6 +147,7 @@ const BlockFrequency Freq) const; void view(const Twine &Name, bool isSimple = true); uint64_t getEntryFreq() const; + const MachineBlockFrequencyInfo &getMBFI() { return MBFI; } private: const MachineBlockFrequencyInfo &MBFI; @@ -154,6 +157,7 @@ private: MBFIWrapper &MBBFreqInfo; const MachineBranchProbabilityInfo &MBPI; + ProfileSummaryInfo *PSI; bool TailMergeBlocks(MachineFunction &MF); bool TryTailMergeBlocks(MachineBasicBlock* SuccBB, diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -38,6 +39,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -102,6 +104,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -128,7 +131,8 @@ BranchFolder::MBFIWrapper MBBFreqInfo( getAnalysis()); BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo, - getAnalysis()); + getAnalysis(), + &getAnalysis().getPSI()); auto *MMIWP = getAnalysisIfAvailable(); return Folder.OptimizeFunction( MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(), @@ -138,9 +142,10 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, MBFIWrapper &FreqInfo, const MachineBranchProbabilityInfo &ProbInfo, + ProfileSummaryInfo *PSI, unsigned MinTailLength) : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength), - MBBFreqInfo(FreqInfo), MBPI(ProbInfo) { + MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) { if (MinCommonTailLength == 0) MinCommonTailLength = TailMergeSize; switch (FlagEnableTailMerge) { @@ -642,7 +647,9 @@ MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB, DenseMap &EHScopeMembership, - bool AfterPlacement) { + bool AfterPlacement, + BranchFolder::MBFIWrapper &MBBFreqInfo, + ProfileSummaryInfo *PSI) { // It is never profitable to tail-merge blocks from two different EH scopes. if (!EHScopeMembership.empty()) { auto EHScope1 = EHScopeMembership.find(MBB1); @@ -728,7 +735,11 @@ // branch instruction, which is likely to be smaller than the 2 // instructions that would be deleted in the merge. 
MachineFunction *MF = MBB1->getParent(); - return EffectiveTailLen >= 2 && MF->getFunction().hasOptSize() && + bool OptForSize = + MF->getFunction().hasOptSize() || + (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) && + llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI())); + return EffectiveTailLen >= 2 && OptForSize && (I1 == MBB1->begin() || I2 == MBB2->begin()); } @@ -750,7 +761,7 @@ CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, EHScopeMembership, - AfterBlockPlacement)) { + AfterBlockPlacement, MBBFreqInfo, PSI)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; @@ -1580,8 +1591,10 @@ } } - if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && - MF.getFunction().hasOptSize()) { + bool OptForSize = + MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo.getMBFI()); + if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) { // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch // direction, thereby defeating careful block placement and regressing // performance. Therefore, only consider this for optsize functions. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -89,6 +89,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include #include #include @@ -251,6 +252,7 @@ const LoopInfo *LI; std::unique_ptr BFI; std::unique_ptr BPI; + ProfileSummaryInfo *PSI; /// As we scan instructions optimizing them, this is the next instruction /// to optimize. Transforms that can invalidate this should update it. @@ -293,7 +295,7 @@ /// Keep track of SExt promoted. ValueToSExts ValToSExtendedUses; - /// True if optimizing for size. + /// True if the function has the OptSize attribute. bool OptSize; /// DataLayout for the Function being processed. @@ -429,10 +431,8 @@ LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); + PSI = &getAnalysis().getPSI(); OptSize = F.hasOptSize(); - - ProfileSummaryInfo *PSI = - &getAnalysis().getPSI(); if (ProfileGuidedSectionPrefix) { if (PSI->isFunctionHotInCallGraph(&F, *BFI)) F.setSectionPrefix(".hot"); @@ -451,7 +451,9 @@ // bypassSlowDivision may create new BBs, but we don't want to reapply the // optimization to those blocks. BasicBlock* Next = BB->getNextNode(); - EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + // F.hasOptSize is already checked in the outer if statement. + if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); BB = Next; } } @@ -1842,7 +1844,8 @@ // cold block. This interacts with our handling for loads and stores to // ensure that we can fold all uses of a potential addressing computation // into their uses. TODO: generalize this to work over profiling data - if (!OptSize && CI->hasFnAttr(Attribute::Cold)) + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get()); + if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) for (auto &Arg : CI->arg_operands()) { if (!Arg->getType()->isPointerTy()) continue; @@ -2777,16 +2780,24 @@ /// When true, IsProfitableToFoldIntoAddressingMode always returns true. bool IgnoreProfitability; + /// True if we are optimizing for size. 
+ bool OptSize; + + ProfileSummaryInfo *PSI; + BlockFrequencyInfo *BFI; + AddressingModeMatcher( SmallVectorImpl &AMI, const TargetLowering &TLI, const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, - std::pair, int64_t> &LargeOffsetGEP) + std::pair, int64_t> &LargeOffsetGEP, + bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), - PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) { + PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP), + OptSize(OptSize), PSI(PSI), BFI(BFI) { IgnoreProfitability = false; } @@ -2804,12 +2815,14 @@ const TargetLowering &TLI, const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, - std::pair, int64_t> &LargeOffsetGEP) { + std::pair, int64_t> &LargeOffsetGEP, + bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { ExtAddrMode Result; bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS, MemoryInst, Result, InsertedInsts, - PromotedInsts, TPT, LargeOffsetGEP) + PromotedInsts, TPT, LargeOffsetGEP, + OptSize, PSI, BFI) .matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; @@ -4420,7 +4433,8 @@ Instruction *I, SmallVectorImpl> &MemoryUses, SmallPtrSetImpl &ConsideredInsts, const TargetLowering &TLI, - const TargetRegisterInfo &TRI, int SeenInsts = 0) { + const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, int SeenInsts = 0) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -4429,8 +4443,6 @@ if (!MightBeFoldableInst(I)) return true; - const bool OptSize = I->getFunction()->hasOptSize(); - // Loop over all the uses, recursively processing them. for (Use &U : I->uses()) { // Conservatively return true if we're seeing a large number or a deep chain @@ -4471,7 +4483,9 @@ if (CallInst *CI = dyn_cast(UserI)) { // If this is a cold call, we can sink the addressing calculation into // the cold path. See optimizeCallInst - if (!OptSize && CI->hasFnAttr(Attribute::Cold)) + bool OptForSize = OptSize || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) continue; InlineAsm *IA = dyn_cast(CI->getCalledValue()); @@ -4483,8 +4497,8 @@ continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, - SeenInsts)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI, SeenInsts)) return true; } @@ -4572,7 +4586,8 @@ // the use is just a particularly nice way of sinking it. SmallVector, 16> MemoryUses; SmallPtrSet ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI)) return false; // Has a non-memory, non-foldable use! 
// Now that we know that all uses of this instruction are part of a chain of @@ -4608,7 +4623,7 @@ TPT.getRestorationPoint(); AddressingModeMatcher Matcher( MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result, - InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); + InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); Matcher.IgnoreProfitability = true; bool Success = Matcher.matchAddr(Address, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -4714,7 +4729,8 @@ 0); ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, - InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP); + InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, + BFI.get()); GetElementPtrInst *GEP = LargeOffsetGEP.first; if (GEP && !NewGEPBases.count(GEP)) { @@ -5932,7 +5948,9 @@ /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // If branch conversion isn't desirable, exit early. - if (DisableSelectToBranch || OptSize || !TLI) + if (DisableSelectToBranch || + OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) || + !TLI) return false; // Find all consecutive select instructions that share the same condition. diff --git a/llvm/lib/CodeGen/ExecutionDomainFix.cpp b/llvm/lib/CodeGen/ExecutionDomainFix.cpp --- a/llvm/lib/CodeGen/ExecutionDomainFix.cpp +++ b/llvm/lib/CodeGen/ExecutionDomainFix.cpp @@ -114,7 +114,7 @@ // Collapse all the instructions. while (!dv->Instrs.empty()) - TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain); + TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain, PSI, MBFI); dv->setSingleDomain(domain); // If there are multiple users, give them new, unique DomainValues. @@ -319,7 +319,7 @@ // If the collapsed operands force a single domain, propagate the collapse. if (isPowerOf2_32(available)) { unsigned domain = countTrailingZeros(available); - TII->setExecutionDomain(*mi, domain); + TII->setExecutionDomain(*mi, domain, PSI, MBFI); visitHardInstr(mi, domain); return; } @@ -437,6 +437,11 @@ RDA = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; + // Initialize the AliasMap on the first use. if (AliasMap.empty()) { // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -13,6 +13,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -20,6 +22,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Transforms/Utils/SizeOpts.h" using namespace llvm; @@ -720,7 +723,8 @@ /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] /// ret i32 %phi.res static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, - const TargetLowering *TLI, const DataLayout *DL) { + const TargetLowering *TLI, const DataLayout *DL, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { NumMemCmpCalls++; // Early exit from expansion if -Oz. 
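The machine-level passes touched in this patch (AsmPrinter and ExecutionDomainFix above, PeepholeOptimizer, MachineCombiner and RegisterCoalescer below) all repeat the same analysis wiring. A sketch of that wiring with the wrapper-pass types written out; ExamplePass is a placeholder, the interfaces used (ProfileSummaryInfoWrapperPass, LazyMachineBlockFrequencyInfoPass) are the ones the hunks require.

#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

struct ExamplePass : llvm::MachineFunctionPass { // hypothetical pass
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::ProfileSummaryInfoWrapperPass>();
    AU.addRequired<llvm::LazyMachineBlockFrequencyInfoPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(llvm::MachineFunction &MF) override {
    auto *PSI = &getAnalysis<llvm::ProfileSummaryInfoWrapperPass>().getPSI();
    // Only materialize the (lazy) block frequencies when a profile summary
    // exists, so non-PGO builds do not pay for the analysis.
    const llvm::MachineBlockFrequencyInfo *MBFI =
        PSI->hasProfileSummary()
            ? &getAnalysis<llvm::LazyMachineBlockFrequencyInfoPass>().getBFI()
            : nullptr;
    (void)MBFI; // hand PSI/MBFI down to the hooks that now take them
    return false;
  }
};
char ExamplePass::ID = 0;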
@@ -741,18 +745,20 @@ // TTI call to check if target would like to expand memcmp. Also, get the // available load sizes. const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(), + bool OptForSize = CI->getFunction()->hasOptSize() || + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp); if (!Options) return false; if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()) Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock; - if (CI->getFunction()->hasOptSize() && + if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize; - if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences()) + if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmp; MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL); @@ -798,7 +804,11 @@ &getAnalysis().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - auto PA = runImpl(F, TLI, TTI, TL); + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; + auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI); return !PA.areAllPreserved(); } @@ -806,22 +816,26 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); FunctionPass::getAnalysisUsage(AU); } PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL); + const TargetLowering* TL, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI); // Returns true if a change was made. bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL); + const DataLayout& DL, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI); }; bool ExpandMemCmpPass::runOnBlock( BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL) { + const DataLayout& DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { for (Instruction& I : BB) { CallInst *CI = dyn_cast(&I); if (!CI) { @@ -830,7 +844,7 @@ LibFunc Func; if (TLI->getLibFunc(ImmutableCallSite(CI), Func) && (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && - expandMemCmp(CI, TTI, TL, &DL)) { + expandMemCmp(CI, TTI, TL, &DL, PSI, BFI)) { return true; } } @@ -840,11 +854,12 @@ PreservedAnalyses ExpandMemCmpPass::runImpl( Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL) { + const TargetLowering* TL, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { const DataLayout& DL = F.getParent()->getDataLayout(); bool MadeChanges = false; for (auto BBIt = F.begin(); BBIt != F.end();) { - if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) { + if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI)) { MadeChanges = true; // If changes were made, restart the function from the beginning, since // the structure of the function was changed. 
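Since the ExpandMemCmp hunks above are hard to read in this flattened form, here is the per-call-site decision they implement, condensed into a self-contained sketch. Names mirror the hunk; pickMemCmpOptions is a hypothetical helper. The point of the change: a memcmp in a profile-cold block gets the size-optimized expansion limits even when its function carries no optsize attribute.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

static llvm::TargetTransformInfo::MemCmpExpansionOptions
pickMemCmpOptions(const llvm::CallInst *CI,
                  const llvm::TargetTransformInfo &TTI,
                  bool IsUsedForZeroCmp, llvm::ProfileSummaryInfo *PSI,
                  llvm::BlockFrequencyInfo *BFI) {
  // Per call site, not per function: OR the optsize attribute with the
  // profile-guided coldness of the block containing the call.
  bool OptForSize = CI->getFunction()->hasOptSize() ||
                    llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
  // Ask the target whether/how to expand, given the size preference; the
  // MaxLoadsPerMemcmp* command-line caps in the hunk follow the same flag.
  return TTI.enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp);
}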
@@ -863,6 +878,8 @@ "Expand memcmp() to load/stores", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", "Expand memcmp() to load/stores", false, false) diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -466,7 +466,7 @@ return true; } - SL->findJumpTables(Clusters, &SI, DefaultMBB); + SL->findJumpTables(Clusters, &SI, DefaultMBB, nullptr, nullptr); LLVM_DEBUG({ dbgs() << "Case clusters: "; diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -211,6 +212,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -432,6 +434,7 @@ INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { @@ -444,6 +447,8 @@ TRI = ST.getRegisterInfo(); BranchFolder::MBFIWrapper MBFI(getAnalysis()); MBPI = &getAnalysis(); + ProfileSummaryInfo *PSI = + &getAnalysis().getPSI(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); @@ -454,7 +459,7 @@ bool BFChange = false; if (!PreRegAlloc) { // Tail merge tend to expose more if-conversion opportunities. 
- BranchFolder BF(true, false, MBFI, *MBPI); + BranchFolder BF(true, false, MBFI, *MBPI, PSI); auto *MMIWP = getAnalysisIfAvailable(); BFChange = BF.OptimizeFunction( MF, TII, ST.getRegisterInfo(), @@ -596,7 +601,7 @@ BBAnalysis.clear(); if (MadeChange && IfCvtBranchFold) { - BranchFolder BF(false, false, MBFI, *MBPI); + BranchFolder BF(false, false, MBFI, *MBPI, PSI); auto *MMIWP = getAnalysisIfAvailable(); BF.OptimizeFunction( MF, TII, MF.getSubtarget().getRegisterInfo(), diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeCalc.h" @@ -94,6 +95,7 @@ const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; const MachineBlockFrequencyInfo &MBFI; + ProfileSummaryInfo *PSI; InsertPointAnalysis IPA; @@ -146,6 +148,7 @@ MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(pass.getAnalysis()), + PSI(&pass.getAnalysis().getPSI()), IPA(LIS, mf.getNumBlockIDs()) {} void addToMergeableSpills(MachineInstr &Spill, int StackSlot, @@ -167,6 +170,7 @@ const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; const MachineBlockFrequencyInfo &MBFI; + ProfileSummaryInfo *PSI; // Variables that are valid during spill(), but used by multiple methods. LiveRangeEdit *Edit; @@ -202,6 +206,7 @@ MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(pass.getAnalysis()), + PSI(&pass.getAnalysis().getPSI()), HSpiller(pass, mf, vrm) {} void spill(LiveRangeEdit &) override; @@ -684,7 +689,7 @@ if (DeadDefs.empty()) return; LLVM_DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n"); - Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA); + Edit->eliminateDeadDefs(DeadDefs, PSI, &MBFI, RegsToSpill, AA); // LiveRangeEdit::eliminateDeadDef is used to remove dead define instructions // after rematerialization. To remove a VNI for a vreg from its LiveInterval, @@ -835,8 +840,9 @@ MachineInstrSpan MIS(MI, MI->getParent()); MachineInstr *FoldMI = - LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS) - : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS, &VRM); + LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, PSI, &MBFI, &LIS) + : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, PSI, &MBFI, &LIS, + &VRM); if (!FoldMI) return false; @@ -1085,7 +1091,7 @@ // Hoisted spills may cause dead code. if (!DeadDefs.empty()) { LLVM_DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n"); - Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA); + Edit->eliminateDeadDefs(DeadDefs, PSI, &MBFI, RegsToSpill, AA); } // Finally delete the SnippetCopies. 
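The reason InlineSpiller and LiveRangeEdit now thread PSI/MBFI into foldMemoryOperand() (and why commuteInstruction()/setExecutionDomain() gained the same parameters earlier) is so that target hooks can make their size/speed choice per block. A minimal sketch of the predicate such a hook could compute; foldForSize is a hypothetical helper, and the MachineBasicBlock-level shouldOptimizeForSize() comes from llvm/CodeGen/MachineSizeOpts.h, which other hunks in this patch already include.

#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/IR/Function.h"

// Decide, at the instruction being folded, whether the smaller encoding
// should win: function-level optsize, or a profile-cold block.
static bool foldForSize(const llvm::MachineInstr &MI,
                        llvm::ProfileSummaryInfo *PSI,
                        const llvm::MachineBlockFrequencyInfo *MBFI) {
  const llvm::MachineFunction &MF = *MI.getMF();
  return MF.getFunction().hasOptSize() ||
         llvm::shouldOptimizeForSize(MI.getParent(), PSI, MBFI);
}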
@@ -1524,7 +1530,7 @@ RMEnt->RemoveOperand(i - 1); } } - Edit.eliminateDeadDefs(SpillsToRm, None, AA); + Edit.eliminateDeadDefs(SpillsToRm, PSI, &MBFI, None, AA); } } diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -12,8 +12,10 @@ #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/VirtRegMap.h" @@ -183,7 +185,9 @@ } bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, - SmallVectorImpl &Dead) { + SmallVectorImpl &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { MachineInstr *DefMI = nullptr, *UseMI = nullptr; // Check that there is a single def and a single use. @@ -226,7 +230,8 @@ if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second) return false; - MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS); + MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, + PSI, MBFI, &LIS); if (!FoldMI) return false; LLVM_DEBUG(dbgs() << " folded: " << *FoldMI); @@ -258,7 +263,8 @@ /// Find all live intervals that need to shrink, then remove the instruction. void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, - AliasAnalysis *AA) { + AliasAnalysis *AA, ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { assert(MI->allDefsAreDead() && "Def isn't really dead"); SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot(); @@ -390,6 +396,8 @@ } void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl &Dead, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, ArrayRef RegsBeingSpilled, AliasAnalysis *AA) { ToShrinkSet ToShrink; @@ -397,7 +405,7 @@ for (;;) { // Erase all dead defs. while (!Dead.empty()) - eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA); + eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA, PSI, MBFI); if (ToShrink.empty()) break; @@ -405,7 +413,7 @@ // Shrink just one live interval. Then delete new dead defs. LiveInterval *LI = ToShrink.back(); ToShrink.pop_back(); - if (foldAsLoad(LI, Dead)) + if (foldAsLoad(LI, Dead, PSI, MBFI)) continue; unsigned VReg = LI->reg; if (TheDelegate) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" @@ -41,6 +42,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -362,6 +364,8 @@ /// A handle to the post dominator tree. MachinePostDominatorTree *MPDT; + ProfileSummaryInfo *PSI; + /// Duplicator used to duplicate tails during placement. 
/// /// Placement decisions can open up new tail duplication opportunities, but @@ -537,6 +541,7 @@ if (TailDupPlacement) AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -554,6 +559,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) @@ -2025,7 +2031,10 @@ // i.e. when the layout predecessor does not fallthrough to the loop header. // In practice this never happens though: there always seems to be a preheader // that can fallthrough and that is also placed before the header. - if (F->getFunction().hasOptSize()) + bool OptForSize = F->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(L.getHeader(), PSI, + &MBFI->getMBFI()); + if (OptForSize) return L.getHeader(); MachineBasicBlock *OldTop = nullptr; @@ -2781,6 +2790,11 @@ if (Freq < (LoopHeaderFreq * ColdProb)) continue; + // If the global profiles indicates so, don't align it. + if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) && + !TLI->alignLoopsWithOptSize()) + continue; + // Check for the existence of a non-layout predecessor which would benefit // from aligning this block. MachineBasicBlock *LayoutPred = @@ -2988,6 +3002,7 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MPDT = nullptr; + PSI = &getAnalysis().getPSI(); // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. @@ -3018,10 +3033,13 @@ if (allowTailDupPlacement()) { MPDT = &getAnalysis(); - if (MF.getFunction().hasOptSize()) + bool OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); + if (OptForSize) TailDupSize = 1; bool PreRegAlloc = false; - TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize); + TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI, + /* LayoutMode */ true, TailDupSize); precomputeTriangleChains(); } @@ -3037,7 +3055,7 @@ if (MF.size() > 3 && EnableTailMerge) { unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, - *MBPI, TailMergeSize); + *MBPI, PSI, TailMergeSize); auto *MMIWP = getAnalysisIfAvailable(); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -67,6 +68,7 @@ AliasAnalysis *AA; MachineDominatorTree *DT; MachineRegisterInfo *MRI; + ProfileSummaryInfo *PSI; MachineBlockFrequencyInfo *MBFI; public: @@ -87,6 +89,7 @@ AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); } void releaseMemory() override { @@ -538,7 +541,7 @@ // Commute commutable instructions. 
bool Commuted = false; if (!FoundCSE && MI->isCommutable()) { - if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) { + if (MachineInstr *NewMI = TII->commuteInstruction(*MI, PSI, MBFI)) { Commuted = true; FoundCSE = VNT.count(NewMI); if (NewMI != MI) { @@ -547,7 +550,7 @@ Changed = true; } else if (!FoundCSE) // MI was changed but it didn't help, commute it back! - (void)TII->commuteInstruction(*MI); + (void)TII->commuteInstruction(*MI, PSI, MBFI); } } @@ -889,6 +892,7 @@ DT = &getAnalysis(); MBFI = &getAnalysis(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); + PSI = &getAnalysis().getPSI(); bool ChangedPRE, ChangedCSE; ChangedPRE = PerformSimplePRE(DT); ChangedCSE = PerformCSE(DT->getRootNode()); diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -12,11 +12,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -66,6 +69,8 @@ MachineLoopInfo *MLI; // Current MachineLoopInfo MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; + MachineBlockFrequencyInfo *MBFI; + ProfileSummaryInfo *PSI; TargetSchedModel TSchedModel; @@ -82,7 +87,7 @@ StringRef getPassName() const override { return "Machine InstCombiner"; } private: - bool doSubstitute(unsigned NewSize, unsigned OldSize); + bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize); bool combineInstructions(MachineBasicBlock *); MachineInstr *getOperandDef(const MachineOperand &MO); unsigned getDepth(SmallVectorImpl &InsInstrs, @@ -131,6 +136,8 @@ AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -408,8 +415,9 @@ /// \returns true when new instruction sequence should be generated /// independent if it lengthens critical path or not -bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { - if (OptSize && (NewSize < OldSize)) +bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize, + bool OptForSize) { + if (OptForSize && (NewSize < OldSize)) return true; if (!TSchedModel.hasInstrSchedModelOrItineraries()) return true; @@ -507,6 +515,8 @@ SparseSet RegUnits; RegUnits.setUniverse(TRI->getNumRegUnits()); + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI); + while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; SmallVector Patterns; @@ -583,7 +593,8 @@ // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { + if (SubstituteAlways || + doSubstitute(NewInstCount, OldInstCount, OptForSize)) { insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, RegUnits, IncrementalUpdate); // Eagerly stop after the first pattern fires. @@ -638,6 +649,10 @@ MRI = &MF.getRegInfo(); MLI = &getAnalysis(); Traces = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? 
+ &getAnalysis().getBFI() : + nullptr; MinInstr = nullptr; OptSize = MF.getFunction().hasOptSize(); diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -71,6 +71,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" @@ -156,6 +158,8 @@ MachineRegisterInfo *MRI; MachineDominatorTree *DT; // Machine dominator tree MachineLoopInfo *MLI; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; public: static char ID; // Pass identification @@ -175,6 +179,8 @@ AU.addRequired(); AU.addPreserved(); } + AU.addRequired(); + AU.addRequired(); } /// Track Def -> Use info used for rewriting copies. @@ -1580,7 +1586,7 @@ auto CP = RI.getCommutePair(); if (CP) { Changed = true; - TII->commuteInstruction(*(RI.getMI()), false, (*CP).first, + TII->commuteInstruction(*(RI.getMI()), PSI, MBFI, false, (*CP).first, (*CP).second); LLVM_DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI())); } @@ -1605,6 +1611,10 @@ MRI = &MF.getRegInfo(); DT = Aggressive ? &getAnalysis() : nullptr; MLI = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; bool Changed = false; @@ -1767,7 +1777,8 @@ unsigned FoldedReg = FoldAsLoadDefReg; MachineInstr *DefMI = nullptr; if (MachineInstr *FoldMI = - TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) { + TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI, + PSI, MBFI)) { // Update LocalMIs since we replaced MI with FoldMI and deleted // DefMI. LLVM_DEBUG(dbgs() << "Replacing: " << *MI); diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -16,6 +16,7 @@ #include "RegAllocBase.h" #include "Spiller.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -190,6 +191,7 @@ AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); // Needed for InlineSpiller. MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/LiveInterval.h" @@ -626,6 +627,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); // Needed for InlineSpiller. 
MachineFunctionPass::getAnalysisUsage(AU); } @@ -3239,6 +3241,7 @@ SpillPlacer = &getAnalysis(); DebugVars = &getAnalysis(); AA = &getAnalysis().getAAResults(); + auto PSI = &getAnalysis().getPSI(); initializeCSRCost(); @@ -3247,7 +3250,7 @@ LLVM_DEBUG(LIS->dump()); SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); - SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI)); + SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, PSI)); ExtraRegInfo.clear(); ExtraRegInfo.resize(MRI->getNumVirtRegs()); NextCascade = 1; diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -40,6 +40,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -549,6 +550,7 @@ au.addPreserved(); au.addRequired(); au.addPreserved(); + au.addRequired(); // Needed for InlineSpiller. MachineFunctionPass::getAnalysisUsage(au); } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -21,6 +21,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -129,6 +131,8 @@ const MachineLoopInfo* Loops; AliasAnalysis *AA; RegisterClassInfo RegClassInfo; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; /// A LaneMask to remember on which subregister live ranges we need to call /// shrinkToUses() later. @@ -538,13 +542,15 @@ AU.addRequired(); AU.addPreserved(); AU.addPreservedID(MachineDominatorsID); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } void RegisterCoalescer::eliminateDeadDefs() { SmallVector NewRegs; LiveRangeEdit(nullptr, NewRegs, *MF, *LIS, - nullptr, this).eliminateDeadDefs(DeadDefs); + nullptr, this).eliminateDeadDefs(DeadDefs, PSI, MBFI); } void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) { @@ -832,7 +838,7 @@ // transformation. Start by commuting the instruction. MachineBasicBlock *MBB = DefMI->getParent(); MachineInstr *NewMI = - TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx); + TII->commuteInstruction(*DefMI, PSI, MBFI, false, UseOpIdx, NewDstIdx); if (!NewMI) return { false, false }; if (Register::isVirtualRegister(IntA.reg) && @@ -3686,6 +3692,10 @@ JoinGlobalCopies = STI.enableJoinGlobalCopies(); else JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; // The MachineScheduler does not currently require JoinSplitEdges. 
This will // either be enabled unconditionally or replaced by a more general live range diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -217,7 +217,7 @@ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { - ForCodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + ForCodeSize = DAG.shouldOptForSize(); MaximumLegalStoreInBits = 0; for (MVT VT : MVT::all_valuetypes()) @@ -12769,7 +12769,7 @@ // Assume that libcalls are the smallest code. // TODO: This restriction should probably be lifted for vectors. - if (DAG.getMachineFunction().getFunction().hasOptSize()) + if (ForCodeSize) return SDValue(); // pow(X, 0.25) --> sqrt(sqrt(X)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -24,6 +24,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -63,6 +65,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Utils/SizeOpts.h" #include #include #include @@ -1005,7 +1008,9 @@ void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - LegacyDivergenceAnalysis * Divergence) { + LegacyDivergenceAnalysis * Divergence, + ProfileSummaryInfo *PSIin, + BlockFrequencyInfo *BFIin) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; @@ -1014,6 +1019,8 @@ LibInfo = LibraryInfo; Context = &MF->getFunction().getContext(); DA = Divergence; + PSI = PSIin; + BFI = BFIin; } SelectionDAG::~SelectionDAG() { @@ -1023,6 +1030,11 @@ delete DbgInfo; } +bool SelectionDAG::shouldOptForSize() const { + return MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(FLI->MBB->getBasicBlock(), PSI, BFI); +} + void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); @@ -1427,7 +1439,7 @@ assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) - Alignment = MF->getFunction().hasOptSize() + Alignment = shouldOptForSize() ? getDataLayout().getABITypeAlignment(C->getType()) : getDataLayout().getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; @@ -5733,12 +5745,13 @@ SrcDelta + G->getOffset()); } -static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { +static bool shouldLowerMemFuncForSize(const MachineFunction &MF, + SelectionDAG &DAG) { // On Darwin, -Os means optimize for size without hurting performance, so // only really optimize for size when -Oz (MinSize) is used. 
if (MF.getTarget().getTargetTriple().isOSDarwin()) return MF.getFunction().hasMinSize(); - return MF.getFunction().hasOptSize(); + return DAG.shouldOptForSize(); } static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl, @@ -5788,7 +5801,7 @@ bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); + bool OptSize = shouldLowerMemFuncForSize(MF, DAG); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -5971,7 +5984,7 @@ bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); + bool OptSize = shouldLowerMemFuncForSize(MF, DAG); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -6077,7 +6090,7 @@ bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); + bool OptSize = shouldLowerMemFuncForSize(MF, DAG); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -28,10 +28,12 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -5348,8 +5350,8 @@ if (Val == 0) return DAG.getConstantFP(1.0, DL, LHS.getValueType()); - const Function &F = DAG.getMachineFunction().getFunction(); - if (!F.hasOptSize() || + bool OptForSize = DAG.shouldOptForSize(); + if (!OptForSize || // If optimizing for size, don't insert too many multiplies. // This inserts up to 5 multiplies. 
countPopulation(Val) + Log2_32(Val) < 7) { @@ -10529,7 +10531,7 @@ return; } - SL->findJumpTables(Clusters, &SI, DefaultMBB); + SL->findJumpTables(Clusters, &SI, DefaultMBB, DAG.getPSI(), DAG.getBFI()); SL->findBitTestClusters(Clusters, &SI); LLVM_DEBUG({ diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -27,8 +27,10 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/FastISel.h" @@ -340,6 +342,8 @@ AU.addRequired(); if (UseMBPI && OptLevel != CodeGenOpt::None) AU.addRequired(); + AU.addRequired(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -442,13 +446,17 @@ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable(); LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + auto *PSI = &getAnalysis().getPSI(); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); SplitCriticalSideEffectEdges(const_cast(Fn), DT, LI); CurDAG->init(*MF, *ORE, this, LibInfo, - getAnalysisIfAvailable()); + getAnalysisIfAvailable(), PSI, BFI); FuncInfo->set(Fn, *MF, CurDAG); SwiftError->setFunction(*MF); diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -36,6 +36,7 @@ class LiveIntervals; class LiveRangeEdit; +class ProfileSummaryInfo; class MachineBlockFrequencyInfo; class MachineDominatorTree; class MachineLoopInfo; @@ -264,6 +265,7 @@ const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; const MachineBlockFrequencyInfo &MBFI; + ProfileSummaryInfo *PSI; public: /// ComplementSpillMode - Select how the complement live range should be @@ -444,7 +446,8 @@ /// Newly created intervals will be appended to newIntervals. SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis, VirtRegMap &vrm, MachineDominatorTree &mdt, - MachineBlockFrequencyInfo &mbfi); + MachineBlockFrequencyInfo &mbfi, + ProfileSummaryInfo *PSI); /// reset - Prepare for a new split. 
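Note: the SelectionDAG hunks above replace Function::hasOptSize() checks with SelectionDAG::shouldOptForSize(), which also returns true when the block currently being selected is considered cold by the profile. A hedged usage sketch follows; the helper name and surrounding logic are hypothetical, only the shouldOptForSize() call is taken from this patch.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical size-aware lowering helper, shown only to illustrate the call.
static SDValue lowerWithSizeHeuristic(SelectionDAG &DAG, SDValue Op) {
  // Previously: DAG.getMachineFunction().getFunction().hasOptSize().
  // Now also true when the current block is cold under the profile.
  if (DAG.shouldOptForSize())
    return SDValue(); // give up; let the smaller expansion or libcall win
  return Op;          // otherwise keep the faster expansion
}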
void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition); diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -365,12 +365,13 @@ SplitEditor::SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis, VirtRegMap &vrm, MachineDominatorTree &mdt, - MachineBlockFrequencyInfo &mbfi) + MachineBlockFrequencyInfo &mbfi, + ProfileSummaryInfo *PSI) : SA(sa), AA(aa), LIS(lis), VRM(vrm), MRI(vrm.getMachineFunction().getRegInfo()), MDT(mdt), TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()), TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()), - MBFI(mbfi), RegAssign(Allocator) {} + MBFI(mbfi), PSI(PSI), RegAssign(Allocator) {} void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { Edit = &LRE; @@ -1429,7 +1430,7 @@ if (Dead.empty()) return; - Edit->eliminateDeadDefs(Dead, None, &AA); + Edit->eliminateDeadDefs(Dead, PSI, &MBFI, None, &AA); } void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) { diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp --- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp @@ -40,9 +40,12 @@ return NumCases; } -void SwitchCG::SwitchLowering::findJumpTables(CaseClusterVector &Clusters, - const SwitchInst *SI, - MachineBasicBlock *DefaultMBB) { +void SwitchCG::SwitchLowering::findJumpTables( + CaseClusterVector &Clusters, + const SwitchInst *SI, + MachineBasicBlock *DefaultMBB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { #ifndef NDEBUG // Clusters must be non-empty, sorted, and only contain Range clusters. assert(!Clusters.empty()); @@ -80,7 +83,7 @@ assert(Range >= NumCases); // Cheap case: the whole range may be suitable for jump table. - if (TLI->isSuitableForJumpTable(SI, NumCases, Range)) { + if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) { CaseCluster JTCluster; if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) { Clusters[0] = JTCluster; @@ -138,7 +141,7 @@ assert(NumCases < UINT64_MAX / 100); assert(Range >= NumCases); - if (TLI->isSuitableForJumpTable(SI, NumCases, Range)) { + if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) { unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]); unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1]; int64_t NumEntries = j - i + 1; diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp --- a/llvm/lib/CodeGen/TailDuplication.cpp +++ b/llvm/lib/CodeGen/TailDuplication.cpp @@ -12,6 +12,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -37,6 +39,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -74,7 +78,11 @@ return false; auto MBPI = &getAnalysis(); - Duplicator.initMF(MF, PreRegAlloc, MBPI, /*LayoutMode=*/false); + auto *PSI = &getAnalysis().getPSI(); + auto *MBFI = (PSI && PSI->hasProfileSummary()) ? 
+ &getAnalysis().getBFI() : + nullptr; + Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI, PSI, /*LayoutMode=*/false); bool MadeChange = false; while (Duplicator.tailDuplicateBlocks()) diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -19,13 +19,16 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -77,6 +80,8 @@ void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPIin, + const MachineBlockFrequencyInfo *MBFIin, + ProfileSummaryInfo *PSIin, bool LayoutModeIn, unsigned TailDupSizeIn) { MF = &MFin; TII = MF->getSubtarget().getInstrInfo(); @@ -84,6 +89,8 @@ MRI = &MF->getRegInfo(); MMI = &MF->getMMI(); MBPI = MBPIin; + MBFI = MBFIin; + PSI = PSIin; TailDupSize = TailDupSizeIn; assert(MBPI != nullptr && "Machine Branch Probability Info required"); @@ -555,14 +562,14 @@ // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. unsigned MaxDuplicateCount; - if (TailDupSize == 0 && - TailDuplicateSize.getNumOccurrences() == 0 && - MF->getFunction().hasOptSize()) - MaxDuplicateCount = 1; - else if (TailDupSize == 0) + bool OptForSize = MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI); + if (TailDupSize == 0) MaxDuplicateCount = TailDuplicateSize; else MaxDuplicateCount = TailDupSize; + if (OptForSize) + MaxDuplicateCount = 1; // If the block to be duplicated ends in an unanalyzable fallthrough, don't // duplicate it. 
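Note: TailDuplicator above is a representative user of the per-block query: the duplication limit drops to one when the tail block should be optimized for size, either because the function carries optsize or because llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI) reports the block as cold. The real helper lives in MachineSizeOpts.h; the sketch below is only a rough, hypothetical restatement of what such a query consults, not the actual implementation.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
using namespace llvm;

// Rough sketch: with a profile summary and block frequencies available, treat
// profile-cold blocks as "optimize for size"; otherwise report false so
// callers fall back to the optsize attribute alone.
static bool pgsoBlockPrefersSize(const MachineBasicBlock *MBB,
                                 ProfileSummaryInfo *PSI,
                                 const MachineBlockFrequencyInfo *MBFI) {
  if (!PSI || !PSI->hasProfileSummary() || !MBFI)
    return false;
  Optional<uint64_t> Count = MBFI->getBlockProfileCount(MBB);
  return Count && PSI->isColdCount(*Count);
}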
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -154,6 +154,8 @@ } MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned Idx1, unsigned Idx2) const { const MCInstrDesc &MCID = MI.getDesc(); @@ -236,7 +238,10 @@ return CommutedMI; } -MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, bool NewMI, +MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose @@ -248,7 +253,7 @@ "Precondition violation: MI must be commutable."); return nullptr; } - return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, OpIdx2); } bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1, @@ -530,6 +535,8 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, ArrayRef Ops, int FI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS, VirtRegMap *VRM) const { auto Flags = MachineMemOperand::MONone; @@ -577,7 +584,7 @@ MBB->insert(MI, NewMI); } else { // Ask the target to do the actual folding. - NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS, VRM); + NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, PSI, MBFI, LIS, VRM); } if (NewMI) { @@ -619,6 +626,8 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, ArrayRef Ops, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS) const { assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!"); #ifndef NDEBUG @@ -643,7 +652,7 @@ NewMI = &*MBB.insert(MI, NewMI); } else { // Ask the target to do the actual folding. - NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS); + NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, PSI, MBFI, LIS); } if (!NewMI) diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -33,6 +33,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" @@ -99,6 +101,8 @@ LiveIntervals *LIS; AliasAnalysis *AA; CodeGenOpt::Level OptLevel; + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; // The current basic block being processed. 
MachineBasicBlock *MBB; @@ -188,6 +192,8 @@ AU.addPreserved(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -684,7 +690,8 @@ unsigned Dist) { Register RegC = MI->getOperand(RegCIdx).getReg(); LLVM_DEBUG(dbgs() << "2addr: COMMUTING : " << *MI); - MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx); + MachineInstr *NewMI = TII->commuteInstruction(*MI, PSI, MBFI, false, RegBIdx, + RegCIdx); if (NewMI == nullptr) { LLVM_DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n"); @@ -1681,6 +1688,10 @@ // fixups are necessary for correctness. if (skipFunction(Func.getFunction())) OptLevel = CodeGenOpt::None; + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; bool MadeChange = false; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -162,6 +162,8 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3177,6 +3177,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS, VirtRegMap *VRM) const { // This is a bit of a hack. Consider this instruction: // diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -515,7 +515,7 @@ auto *BB = OrigMI.getParent(); auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); BB->insert(OrigMI, NewMI); - if (TII->commuteInstruction(*NewMI)) { + if (TII->commuteInstruction(*NewMI, nullptr, nullptr)) { LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ)) { diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -276,7 +276,7 @@ MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF)); if (Fold.isCommuted()) - TII.commuteInstruction(*Inst32, false); + TII.commuteInstruction(*Inst32, nullptr, nullptr, false); return true; } @@ -393,7 +393,8 @@ return false; if (!CanCommute || - !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) + !TII->commuteInstruction(*MI, nullptr, nullptr, false, CommuteIdx0, + CommuteIdx1)) return false; if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) { @@ -421,7 +422,8 @@ return true; } - TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); + TII->commuteInstruction(*MI, nullptr, nullptr, false, CommuteIdx0, + CommuteIdx1); return false; } @@ -1230,7 +1232,7 @@ tryFoldInst(TII, Fold.UseMI); } else if (Fold.isCommuted()) { // Restoring instruction's original operand order if fold has failed. 
- TII->commuteInstruction(*Fold.UseMI, false); + TII->commuteInstruction(*Fold.UseMI, nullptr, nullptr, false); } } } diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -371,7 +371,7 @@ MachineOperand &Op1 = A->getOperand(1); MachineOperand &Op2 = A->getOperand(2); if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { - TII->commuteInstruction(*A); + TII->commuteInstruction(*A, nullptr, nullptr); Changed = true; } if (Op1.getReg() != ExecReg) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -137,7 +137,10 @@ MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1662,7 +1662,10 @@ return &MI; } -MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, +MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned Src0Idx, unsigned Src1Idx) const { assert(!NewMI && "this should never be used"); @@ -1686,7 +1689,8 @@ if (isOperandLegal(MI, Src1Idx, &Src0)) { // Be sure to copy the source modifiers to the right place. CommutedMI - = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); + = TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, + Src0Idx, Src1Idx); } } else if (Src0.isReg() && !Src1.isReg()) { @@ -2452,7 +2456,7 @@ if (Def && Def->isMoveImmediate() && isInlineConstant(Def->getOperand(1)) && MRI->hasOneUse(Src1->getReg()) && - commuteInstruction(UseMI)) { + commuteInstruction(UseMI, nullptr, nullptr)) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); } else if ((Register::isPhysicalRegister(Src1->getReg()) && RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -113,12 +113,12 @@ // We have failed to fold src0, so commute the instruction and try again. if (TryToCommute && MI.isCommutable()) { - if (TII->commuteInstruction(MI)) { + if (TII->commuteInstruction(MI, nullptr, nullptr)) { if (foldImmediates(MI, TII, MRI, false)) return true; // Commute back. - TII->commuteInstruction(MI); + TII->commuteInstruction(MI, nullptr, nullptr); } } @@ -183,7 +183,7 @@ // cmpk instructions do scc = dst imm16, so commute the instruction to // get constants on the RHS. 
if (!MI.getOperand(0).isReg()) - TII->commuteInstruction(MI, false, 0, 1); + TII->commuteInstruction(MI, nullptr, nullptr, false, 0, 1); const MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) @@ -355,7 +355,7 @@ if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && SrcImm == Src0) { - if (!TII->commuteInstruction(MI, false, 1, 2)) + if (!TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2)) NewImm = 0; } @@ -634,7 +634,7 @@ MachineOperand *Src1 = &MI.getOperand(2); if (!Src0->isReg() && Src1->isReg()) { - if (TII->commuteInstruction(MI, false, 1, 2)) + if (TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2)) std::swap(Src0, Src1); } @@ -704,7 +704,8 @@ if (!TII->canShrink(MI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. - if (!MI.isCommutable() || !TII->commuteInstruction(MI) || + if (!MI.isCommutable() || + !TII->commuteInstruction(MI, nullptr, nullptr) || !TII->canShrink(MI, MRI)) continue; } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -96,7 +96,10 @@ /// non-commutable pair of operand indices OpIdx1 and OpIdx2. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; @@ -326,7 +329,9 @@ /// VFP/NEON execution domains. std::pair getExecutionDomain(const MachineInstr &MI) const override; - void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override; + void setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const override; unsigned getPartialRegUpdateClearance(const MachineInstr &, unsigned, diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2146,6 +2146,8 @@ } MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { @@ -2159,7 +2161,8 @@ if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; MachineInstr *CommutedMI = - TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, + OpIdx2); if (!CommutedMI) return nullptr; // After swapping the MOVCC operands, also invert the condition. 
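Note: the TargetInstrInfo::commuteInstruction / commuteInstructionImpl signature change fans out to every target override (AArch64, AMDGPU, ARM, PowerPC, SystemZ, WebAssembly, X86 in the hunks around this point). Targets that do not consult the profile simply forward the two extra parameters to the base implementation, and call sites with no profile context pass nullptr for both, as the AMDGPU and SystemZ hunks above show. A minimal sketch of such an override for a hypothetical target:

// Hypothetical target hook that forwards PSI/MBFI untouched. In this patch,
// only the X86 override below actually consults them (for the
// optsize/PGSO-driven BLENDP* -> MOVSD/MOVSS rewrite).
MachineInstr *MyTargetInstrInfo::commuteInstructionImpl(
    MachineInstr &MI, ProfileSummaryInfo *PSI,
    const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned OpIdx1,
    unsigned OpIdx2) const {
  // Target-specific legality checks would go here; the common case delegates.
  return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1,
                                                 OpIdx2);
}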
@@ -2168,7 +2171,8 @@ return CommutedMI; } } - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, + OpIdx2); } /// Identify instructions that can be folded into a MOVCC instruction, and @@ -4926,7 +4930,9 @@ } void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI, - unsigned Domain) const { + unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { unsigned DstReg, SrcReg, DReg; unsigned Lane; MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -758,7 +758,7 @@ if (Reg1 != Reg0) return false; // Try to commute the operands to make it a 2-address instruction. - MachineInstr *CommutedMI = TII->commuteInstruction(*MI); + MachineInstr *CommutedMI = TII->commuteInstruction(*MI, nullptr, nullptr); if (!CommutedMI) return false; } @@ -770,7 +770,8 @@ MI->getOperand(CommOpIdx2).getReg() != Reg0) return false; MachineInstr *CommutedMI = - TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2); + TII->commuteInstruction(*MI, nullptr, nullptr, false, + CommOpIdx1, CommOpIdx2); if (!CommutedMI) return false; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -171,7 +171,10 @@ /// /// For example, we can commute rlwimi instructions, but only if the /// rotate amt is zero. We also have to munge the immediates a bit. - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -365,14 +365,18 @@ return 0; } -MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, +MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { MachineFunction &MF = *MI.getParent()->getParent(); // Normal instructions can be commuted the obvious way. if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo) - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, + OpIdx2); // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because // changing the relative order of the mask operands might change what happens diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -194,7 +194,10 @@ /// non-commutable operands. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. 
- MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override; @@ -261,11 +264,15 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const override; bool expandPostRAPseudo(MachineInstr &MBBI) const override; bool reverseBranchCondition(SmallVectorImpl &Cond) const diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -270,6 +270,8 @@ } MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { @@ -293,11 +295,13 @@ unsigned CCValid = WorkingMI.getOperand(3).getImm(); unsigned CCMask = WorkingMI.getOperand(4).getImm(); WorkingMI.getOperand(4).setImm(CCMask ^ CCValid); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } default: - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, + NewMI, OpIdx1, OpIdx2); } } @@ -655,7 +659,7 @@ } if (CommuteIdx != -1) - if (!commuteInstruction(UseMI, false, CommuteIdx, UseIdx)) + if (!commuteInstruction(UseMI, nullptr, nullptr, false, CommuteIdx, UseIdx)) return false; bool DeleteDef = MRI->hasOneNonDBGUse(Reg); @@ -997,6 +1001,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS, VirtRegMap *VRM) const { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1210,6 +1215,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS) const { return nullptr; } diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp --- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -137,7 +137,7 @@ // If the destination (now) matches one source, prefer this to be first. 
if (DestReg != Src1Reg && DestReg == Src2Reg) { - TII->commuteInstruction(*MBBI, false, 1, 2); + TII->commuteInstruction(*MBBI, nullptr, nullptr, false, 1, 2); std::swap(Src1Reg, Src2Reg); std::swap(Src1IsHigh, Src2IsHigh); } @@ -269,4 +269,3 @@ return Modified; } - diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp --- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -185,7 +185,7 @@ return true; } if (MI.getOperand(0).getReg() == MI.getOperand(2).getReg()) { - TII->commuteInstruction(MI, false, 1, 2); + TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2); MI.setDesc(TII->get(Opcode)); MI.tieOperands(0, 1); return true; @@ -338,7 +338,7 @@ if ((MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) && (!MI.isCommutable() || MI.getOperand(0).getReg() != MI.getOperand(2).getReg() || - !TII->commuteInstruction(MI, false, 1, 2))) + !TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2))) break; MI.setDesc(TII->get(TwoOperandOpcode)); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -48,7 +48,10 @@ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -85,7 +85,9 @@ } MachineInstr *WebAssemblyInstrInfo::commuteInstructionImpl( - MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { + MachineInstr &MI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { // If the operands are stackified, we can't reorder them. WebAssemblyFunctionInfo &MFI = *MI.getParent()->getParent()->getInfo(); @@ -94,7 +96,8 @@ return nullptr; // Otherwise use the default implementation. - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, + OpIdx1, OpIdx2); } // Branch analysis. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -740,7 +740,8 @@ assert(!Declined && "Don't decline commuting until you've finished trying it"); // Commuting didn't help. Revert it. - TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1); + TII->commuteInstruction(*Insert, nullptr, nullptr, /*NewMI=*/false, + Operand0, Operand1); TentativelyCommuting = false; Declined = true; } else if (!Declined && TreeWalker.hasRemainingOperands(Insert)) { @@ -748,7 +749,8 @@ Operand1 = TargetInstrInfo::CommuteAnyOperandIndex; if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) { // Tentatively commute the operands and try again. 
- TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1); + TII->commuteInstruction(*Insert, nullptr, nullptr, /*NewMI=*/false, + Operand0, Operand1); TreeWalker.resetTopOperands(Insert); TentativelyCommuting = true; Declined = false; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -179,6 +179,11 @@ bool Op0IsKill, unsigned Op1, bool Op1IsKill, unsigned Op2, bool Op2IsKill, unsigned Op3, bool Op3IsKill); + + bool shouldOptForSize(const MachineFunction *MF) const { + // TODO: Implement PGSO. + return MF->getFunction().hasOptSize(); + } }; } // end anonymous namespace. @@ -3941,7 +3946,7 @@ MachineInstr *Result = XII.foldMemoryOperandImpl( *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment, - /*AllowCommute=*/true); + /*AllowCommute=*/true, nullptr, nullptr); if (!Result) return false; diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -48,11 +48,14 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" @@ -113,6 +116,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); // Machine loop info is used to // guide some heuristics. + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -140,6 +145,9 @@ /// Register Liveness information after the current instruction. LivePhysRegs LiveRegs; + + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; }; char FixupBWInstPass::ID = 0; } @@ -154,8 +162,11 @@ this->MF = &MF; TII = MF.getSubtarget().getInstrInfo(); - OptForSize = MF.getFunction().hasOptSize(); MLI = &getAnalysis(); + PSI = &getAnalysis().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis().getBFI() : + nullptr; LiveRegs.init(TII->getRegisterInfo()); LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";); @@ -426,6 +437,9 @@ // We run after PEI, so we need to AddPristinesAndCSRs. LiveRegs.addLiveOuts(MBB); + OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); + for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { MachineInstr *MI = &*I; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -335,7 +335,7 @@ // Do not want to hoist if we're not optimizing for size. // TODO: We'd like to remove this restriction. // See the comment in X86InstrInfo.td for more info. - if (!OptForSize) + if (!CurDAG->shouldOptForSize()) return false; // Walk all the users of the immediate. @@ -3019,7 +3019,7 @@ LLVM_FALLTHROUGH; case X86ISD::ADD: // Try to match inc/dec. 
- if (!Subtarget->slowIncDec() || OptForSize) { + if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -8306,7 +8308,7 @@ // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -11161,7 +11163,7 @@ case MVT::v32i16: case MVT::v64i8: { // Attempt to lower to a bitmask if we can. Only if not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize) { if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -18307,7 +18309,7 @@ "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -18532,7 +18534,7 @@ /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool IsOptimizingSize = DAG.shouldOptForSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -20399,7 +20401,7 @@ } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. 
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -39598,7 +39600,7 @@ return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -24,7 +24,9 @@ #include "X86GenInstrInfo.inc" namespace llvm { +class MachineBlockFrequencyInfo; class MachineInstrBuilder; +class ProfileSummaryInfo; class X86RegisterInfo; class X86Subtarget; @@ -341,6 +343,8 @@ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; @@ -350,6 +354,7 @@ MachineInstr *foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS = nullptr) const override; /// unfoldMemoryOperand - Separate a single instruction which folded a load or @@ -427,9 +432,13 @@ uint16_t getExecutionDomainCustom(const MachineInstr &MI) const; - void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override; + void setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const override; - bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const; + bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const; unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, @@ -444,7 +453,9 @@ ArrayRef MOs, MachineBasicBlock::iterator InsertPt, unsigned Size, unsigned Alignment, - bool AllowCommute) const; + bool AllowCommute, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const; bool isHighLatencyDef(int opc) const override; @@ -490,7 +501,9 @@ MachineInstr *optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const override; + MachineInstr *&DefMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const override; std::pair decomposeMachineOperandsTargetFlags(unsigned TF) const override; @@ -537,7 +550,10 @@ /// non-commutable operands. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. 
- MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -19,14 +19,17 @@ #include "X86TargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -1524,7 +1527,10 @@ #undef VPERM_CASES } -MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & { @@ -1555,7 +1561,8 @@ auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); WorkingMI.getOperand(3).setImm(Size - Amt); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::PFSUBrr: @@ -1566,15 +1573,20 @@ (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: case X86::VBLENDPDrri: - case X86::VBLENDPSrri: + case X86::VBLENDPSrri: { // If we're optimizing for size, try to use MOVSD/MOVSS. 
- if (MI.getParent()->getParent()->getFunction().hasOptSize()) { + auto *MBB = MI.getParent(); + auto MF = MBB->getParent(); + bool OptForSize = MF->getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(MBB, PSI, MBFI); + if (OptForSize) { unsigned Mask, Opc; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); @@ -1588,11 +1600,13 @@ WorkingMI.setDesc(get(Opc)); WorkingMI.RemoveOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, + PSI, MBFI, /*NewMI=*/false, OpIdx1, OpIdx2); } } LLVM_FALLTHROUGH; + } case X86::PBLENDWrri: case X86::VBLENDPDYrri: case X86::VBLENDPSYrri: @@ -1621,7 +1635,8 @@ int8_t Imm = MI.getOperand(3).getImm() & Mask; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::INSERTPSrr: @@ -1641,7 +1656,8 @@ unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } return nullptr; @@ -1664,7 +1680,8 @@ auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -1675,7 +1692,8 @@ auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::SHUFPDrri)); WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::SHUFPDrri: { @@ -1684,7 +1702,8 @@ auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::MOVSDrr)); WorkingMI.RemoveOperand(3); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: @@ -1700,7 +1719,8 @@ unsigned Src2Hi = Imm & 0x10; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri: @@ -1732,7 +1752,8 @@ Imm = X86::getSwappedVPCMPImm(Imm); auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPCOMBri: case X86::VPCOMUBri: @@ -1744,7 +1765,8 @@ Imm = X86::getSwappedVPCOMImm(Imm); auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VCMPSDZrr: @@ -1765,7 +1787,8 @@ Imm = X86::getSwappedVCMPImm(Imm); auto &WorkingMI = 
cloneIfNew(MI); WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPERM2F128rr: @@ -1776,7 +1799,8 @@ int8_t Imm = MI.getOperand(3).getImm() & 0xFF; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Imm ^ 0x22); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::MOVHLPSrr: @@ -1799,7 +1823,8 @@ } auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: { @@ -1807,7 +1832,8 @@ unsigned OpNo = MI.getDesc().getNumOperands() - 1; X86::CondCode CC = static_cast(MI.getOperand(OpNo).getImm()); WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: @@ -1842,7 +1868,8 @@ case X86::VPTERNLOGQZrmbikz: { auto &WorkingMI = cloneIfNew(MI); commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } default: { @@ -1850,7 +1877,8 @@ unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -1861,11 +1889,13 @@ getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI, + /*NewMI=*/false, OpIdx1, OpIdx2); } - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, + NewMI, OpIdx1, OpIdx2); } } } @@ -3833,7 +3863,9 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const { + MachineInstr *&DefMI, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); @@ -3859,7 +3891,8 @@ return nullptr; // Check whether we can fold the def into SrcOperandId. 
- if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI, + PSI, MBFI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -4822,7 +4855,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align, bool AllowCommute) const { + unsigned Size, unsigned Align, bool AllowCommute, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI) const { bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); bool isTwoAddrFold = false; @@ -4955,7 +4989,7 @@ return nullptr; MachineInstr *CommutedMI = - commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + commuteInstruction(MI, PSI, MBFI, false, CommuteOpIdx1, CommuteOpIdx2); if (!CommutedMI) { // Unable to commute. return nullptr; @@ -4968,13 +5002,14 @@ // Attempt to fold with the commuted version of the instruction. NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, - Size, Align, /*AllowCommute=*/false); + Size, Align, /*AllowCommute=*/false, + PSI, MBFI); if (NewMI) return NewMI; // Folding failed again - undo the commute before returning. MachineInstr *UncommutedMI = - commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + commuteInstruction(MI, PSI, MBFI, false, CommuteOpIdx1, CommuteOpIdx2); if (!UncommutedMI) { // Unable to commute. return nullptr; @@ -5000,7 +5035,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, - int FrameIndex, LiveIntervals *LIS, + int FrameIndex, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI, + LiveIntervals *LIS, VirtRegMap *VRM) const { // Check switch flag if (NoFusing) @@ -5050,7 +5088,8 @@ return foldMemoryOperandImpl(MF, MI, Ops[0], MachineOperand::CreateFI(FrameIndex), InsertPt, - Size, Alignment, /*AllowCommute=*/true); + Size, Alignment, /*AllowCommute=*/true, + PSI, MBFI); } /// Check if \p LoadMI is a partial register load that we can't fold into \p MI @@ -5191,6 +5230,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, LiveIntervals *LIS) const { // TODO: Support the case where LoadMI loads a wide register, but MI @@ -5206,7 +5246,8 @@ if (isLoadFromStackSlot(LoadMI, FrameIndex)) { if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) return nullptr; - return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS); + return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, PSI, MBFI, + LIS); } // Check switch flag @@ -5358,7 +5399,8 @@ } } return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt, - /*Size=*/0, Alignment, /*AllowCommute=*/true); + /*Size=*/0, Alignment, /*AllowCommute=*/true, + PSI, MBFI); } static SmallVector @@ -6685,7 +6727,9 @@ } bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, - unsigned Domain) const { + unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { assert(Domain > 0 && Domain < 4 && "Invalid execution domain"); uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); @@ -6794,7 +6838,7 @@ MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0) { - commuteInstruction(MI, false); 
+ commuteInstruction(MI, PSI, MBFI, false); return true; } // We must always return true for MOVHLPSrr. @@ -6857,13 +6901,15 @@ return std::make_pair(domain, validDomains); } -void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { +void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); // Attempt to match for custom instructions. - if (setExecutionDomainCustom(MI, Domain)) + if (setExecutionDomainCustom(MI, Domain, PSI, MBFI)) return; const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs); diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -983,12 +983,12 @@ // the Function object through the Subtarget and objections were raised // to that (see post-commit review comments for r301750). let RecomputePerFunction = 1 in { - def OptForSize : Predicate<"MF->getFunction().hasOptSize()">; + def OptForSize : Predicate<"shouldOptForSize(MF)">; def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">; - def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">; + def OptForSpeed : Predicate<"!shouldOptForSize(MF)">; def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " - "MF->getFunction().hasOptSize()">; - def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || " + "shouldOptForSize(MF)">; + def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || " "!Subtarget->hasSSE41()">; } diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp --- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -25,6 +25,8 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -32,6 +34,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -247,6 +250,12 @@ static char ID; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + private: using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>; @@ -681,6 +690,11 @@ MRI = &MF.getRegInfo(); TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + auto *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + auto *MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : + nullptr; // Process all basic blocks. for (auto &MBB : MF) { @@ -699,7 +713,9 @@ // Remove redundant address calculations. Do it only for -Os/-Oz since only // a code size gain is expected from this part of the pass.
- if (MF.getFunction().hasOptSize()) + bool OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); + if (OptForSize) Changed |= removeRedundantAddrCalc(LEAs); } diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -17,8 +17,11 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Function.h" @@ -52,6 +55,12 @@ bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); } @@ -105,6 +114,12 @@ TSM.init(&MF.getSubtarget()); + auto *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + auto *MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : + nullptr; + // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); VisitedBBs.clear(); @@ -118,6 +133,11 @@ MachineBasicBlock *MBB = I->first; unsigned Cycles = I->second; + // Function::hasOptSize is already checked above. + bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI); + if (OptForSize) + continue; + if (Cycles < Threshold) { // BB ends in a return. Skip over any DBG_VALUE instructions // trailing the terminator.
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -11,6 +11,7 @@ ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Create Garbage Collector Module Metadata +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -45,13 +46,17 @@ ; CHECK-NEXT: Analysis for ComputingKnownBits ; CHECK-NEXT: InstructionSelect ; CHECK-NEXT: ResetMachineFunction +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation ; CHECK-NEXT: Eliminate PHI nodes for register allocation +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator -; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Post-RA pseudo instruction expansion pass diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis -; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Profile summary info +; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -35,6 +35,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -77,10 +80,13 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -92,6 +98,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Machine Trace Metrics ; CHECK-NEXT: AArch64 Conditional Compares +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine InstCombiner ; CHECK-NEXT: AArch64 Conditional Branch Tuning ; CHECK-NEXT: Machine Trace Metrics @@ -107,6 +114,7 @@ ; CHECK-NEXT: Machine Common Subexpression Elimination ; CHECK-NEXT: MachinePostDominator Tree Construction ; CHECK-NEXT: Machine code sinking +; CHECK-NEXT: Lazy Machine Block 
Frequency Analysis ; CHECK-NEXT: Peephole Optimizations ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: AArch64 Dead register definitions @@ -115,6 +123,7 @@ ; CHECK-NEXT: Remove unreachable machine basic blocks ; CHECK-NEXT: Live Variable Analysis ; CHECK-NEXT: Eliminate PHI nodes for register allocation +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Slot index numbering @@ -129,7 +138,6 @@ ; CHECK-NEXT: Live Register Matrix ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: Spill Code Placement Analysis -; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Greedy Register Allocator ; CHECK-NEXT: Virtual Register Rewriter @@ -148,6 +156,7 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll @@ -0,0 +1,128 @@ +; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-linux-gnu -o - | \ +; RUN: FileCheck --check-prefixes=CHECK,CHECK-LINUX %s +; ARM64: Calls to bzero() replaced with calls to memset() + +; CHECK-LABEL: fct1: +; For small size (<= 256), we do not change memset to bzero. +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct1(i8* nocapture %ptr) !prof !14 { +entry: + tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false) + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) + +; CHECK-LABEL: fct2: +; When the size is bigger than 256, change into bzero. +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset +define void @fct2(i8* nocapture %ptr) !prof !14 { +entry: + tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false) + ret void +} + +; CHECK-LABEL: fct3: +; For unknown size, change to bzero. +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset +define void @fct3(i8* nocapture %ptr, i32 %unknown) !prof !14 { +entry: + %conv = sext i32 %unknown to i64 + tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false) + ret void +} + +; CHECK-LABEL: fct4: +; Size <= 256, no change. +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct4(i8* %ptr) !prof !14 { +entry: + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp) + ret void +} + +declare i8* @__memset_chk(i8*, i32, i64, i64) + +declare i64 @llvm.objectsize.i64(i8*, i1) + +; CHECK-LABEL: fct5: +; Size > 256, change. +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset +define void @fct5(i8* %ptr) !prof !14 { +entry: + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp) + ret void +} + +; CHECK-LABEL: fct6: +; Size = unknown, change. 
+; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset +define void @fct6(i8* %ptr, i32 %unknown) !prof !14 { +entry: + %conv = sext i32 %unknown to i64 + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp) + ret void +} + +; Next functions check that memset is not turned into bzero +; when the set constant is non-zero, whatever the given size. + +; CHECK-LABEL: fct7: +; memset with something that is not a zero, no change. +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct7(i8* %ptr) !prof !14 { +entry: + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp) + ret void +} + +; CHECK-LABEL: fct8: +; memset with something that is not a zero, no change. +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct8(i8* %ptr) !prof !14 { +entry: + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp) + ret void +} + +; CHECK-LABEL: fct9: +; memset with something that is not a zero, no change. +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct9(i8* %ptr, i32 %unknown) !prof !14 { +entry: + %conv = sext i32 %unknown to i64 + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll b/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll --- a/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll +++ b/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll @@ -18,7 +18,6 @@ ; GreedyRegAlloc, please adjust accordingly.) 
; HOTNESS: Executing Pass 'Spill Code Placement Analysis' -; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis' ; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter' ; HOTNESS-NEXT: MachineBlockFrequencyInfo is available ; HOTNESS-NEXT: Executing Pass 'Greedy Register Allocator' diff --git a/llvm/test/CodeGen/AArch64/max-jump-table.ll b/llvm/test/CodeGen/AArch64/max-jump-table.ll --- a/llvm/test/CodeGen/AArch64/max-jump-table.ll +++ b/llvm/test/CodeGen/AArch64/max-jump-table.ll @@ -215,3 +215,136 @@ return: ret void } + +define i32 @jt1_optsize(i32 %a, i32 %b) optsize { +entry: + switch i32 %a, label %return [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 + i32 8, label %bb8 + i32 9, label %bb9 + i32 10, label %bb10 + i32 11, label %bb11 + i32 12, label %bb12 + i32 13, label %bb13 + i32 14, label %bb14 + i32 15, label %bb15 + i32 16, label %bb16 + i32 17, label %bb17 + ] +; CHECK-LABEL: function jt1_optsize: +; CHECK-NEXT: Jump Tables: +; CHECK0-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK0-NOT: %jump-table.1: +; CHECK4-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK4-NOT: %jump-table.1: +; CHECK8-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK8-NOT: %jump-table.1: +; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK16-NOT: %jump-table.1: +; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM1-NOT: %jump-table.1: +; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM3-NOT: %jump-table.1: +; CHECK-DAG: End machine code for function jt1_optsize. 
+ +bb1: tail call void @ext(i32 1, i32 0) br label %return +bb2: tail call void @ext(i32 2, i32 2) br label %return +bb3: tail call void @ext(i32 3, i32 4) br label %return +bb4: tail call void @ext(i32 4, i32 6) br label %return +bb5: tail call void @ext(i32 5, i32 8) br label %return +bb6: tail call void @ext(i32 6, i32 10) br label %return +bb7: tail call void @ext(i32 7, i32 12) br label %return +bb8: tail call void @ext(i32 8, i32 14) br label %return +bb9: tail call void @ext(i32 9, i32 16) br label %return +bb10: tail call void @ext(i32 1, i32 18) br label %return +bb11: tail call void @ext(i32 2, i32 20) br label %return +bb12: tail call void @ext(i32 3, i32 22) br label %return +bb13: tail call void @ext(i32 4, i32 24) br label %return +bb14: tail call void @ext(i32 5, i32 26) br label %return +bb15: tail call void @ext(i32 6, i32 28) br label %return +bb16: tail call void @ext(i32 7, i32 30) br label %return +bb17: tail call void @ext(i32 8, i32 32) br label %return + +return: ret i32 %b +} + +define i32 @jt1_pgso(i32 %a, i32 %b) !prof !14 { +entry: + switch i32 %a, label %return [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 + i32 8, label %bb8 + i32 9, label %bb9 + i32 10, label %bb10 + i32 11, label %bb11 + i32 12, label %bb12 + i32 13, label %bb13 + i32 14, label %bb14 + i32 15, label %bb15 + i32 16, label %bb16 + i32 17, label %bb17 + ] +; CHECK-LABEL: function jt1_pgso: +; CHECK-NEXT: Jump Tables: +; CHECK0-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK0-NOT: %jump-table.1: +; CHECK4-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK4-NOT: %jump-table.1: +; CHECK8-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK8-NOT: %jump-table.1: +; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK16-NOT: %jump-table.1: +; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM1-NOT: %jump-table.1: +; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM3-NOT: %jump-table.1: +; CHECK-DAG: End machine code for function jt1_pgso. 
+ +bb1: tail call void @ext(i32 1, i32 0) br label %return +bb2: tail call void @ext(i32 2, i32 2) br label %return +bb3: tail call void @ext(i32 3, i32 4) br label %return +bb4: tail call void @ext(i32 4, i32 6) br label %return +bb5: tail call void @ext(i32 5, i32 8) br label %return +bb6: tail call void @ext(i32 6, i32 10) br label %return +bb7: tail call void @ext(i32 7, i32 12) br label %return +bb8: tail call void @ext(i32 8, i32 14) br label %return +bb9: tail call void @ext(i32 9, i32 16) br label %return +bb10: tail call void @ext(i32 1, i32 18) br label %return +bb11: tail call void @ext(i32 2, i32 20) br label %return +bb12: tail call void @ext(i32 3, i32 22) br label %return +bb13: tail call void @ext(i32 4, i32 24) br label %return +bb14: tail call void @ext(i32 5, i32 26) br label %return +bb15: tail call void @ext(i32 6, i32 28) br label %return +bb16: tail call void @ext(i32 7, i32 30) br label %return +bb17: tail call void @ext(i32 8, i32 32) br label %return + +return: ret i32 %b +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -19,6 +19,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -64,8 +67,11 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: ARM Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -80,6 +86,7 @@ ; CHECK-NEXT: Machine Common Subexpression Elimination ; CHECK-NEXT: MachinePostDominator Tree Construction ; CHECK-NEXT: Machine code sinking +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Peephole Optimizations ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: ARM MLA / MLS expansion pass @@ -92,6 +99,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Eliminate PHI nodes for register allocation +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Slot index numbering ; CHECK-NEXT: Live Interval Analysis @@ -105,7 +113,6 @@ ; CHECK-NEXT: Live Register Matrix ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: Spill Code Placement Analysis -; CHECK-NEXT: Lazy Machine Block 
Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Greedy Register Allocator ; CHECK-NEXT: Virtual Register Rewriter @@ -121,11 +128,13 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass ; CHECK-NEXT: ARM load / store optimization pass ; CHECK-NEXT: ReachingDefAnalysis +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: ARM Execution Domain Fix ; CHECK-NEXT: BreakFalseDeps ; CHECK-NEXT: ARM pseudo instruction expansion pass diff --git a/llvm/test/CodeGen/ARM/constantpool-align.ll b/llvm/test/CodeGen/ARM/constantpool-align.ll --- a/llvm/test/CodeGen/ARM/constantpool-align.ll +++ b/llvm/test/CodeGen/ARM/constantpool-align.ll @@ -17,3 +17,28 @@ store <4 x i32> , <4 x i32>* %p, align 4 ret void } + +; CHECK-LABEL: f_pgso: +; CHECK: vld1.64 {{.*}}, [r1] +; CHECK: .p2align 3 +define void @f_pgso(<4 x i32>* %p) !prof !14 { + store <4 x i32> , <4 x i32>* %p, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -23,6 +23,17 @@ ret void } +; Perform tail call optimization for external symbol. +@dest_pgso = global [2 x i8] zeroinitializer +define void @caller_extern_pgso(i8* %src) !prof !14 { +entry: +; CHECK: caller_extern_pgso +; CHECK-NOT: call memcpy +; CHECK: tail memcpy + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @dest_pgso, i32 0, i32 0), i8* %src, i32 7, i1 false) + ret void +} + ; Perform indirect tail call optimization (for function pointer call). 
declare void @callee_indirect1() declare void @callee_indirect2() @@ -146,3 +157,20 @@ tail call void @callee_nostruct() ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -14,6 +14,7 @@ ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Create Garbage Collector Module Metadata +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -37,6 +38,10 @@ ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: X86 DAG->DAG Instruction Selection ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions @@ -46,11 +51,11 @@ ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: Eliminate PHI nodes for register allocation +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: X86 FP Stackifier -; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Post-RA pseudo instruction expansion pass diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -13,8 +13,8 @@ ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Profile summary info +; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -32,6 +32,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -64,12 +67,15 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: X86 DAG->DAG Instruction Selection ; 
CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Local Dynamic TLS Access Clean-up ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: X86 Domain Reassignment Pass +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -80,6 +86,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Machine Trace Metrics ; CHECK-NEXT: Early If-Conversion +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine InstCombiner ; CHECK-NEXT: X86 cmov Conversion ; CHECK-NEXT: MachineDominator Tree Construction @@ -90,10 +97,12 @@ ; CHECK-NEXT: Machine Common Subexpression Elimination ; CHECK-NEXT: MachinePostDominator Tree Construction ; CHECK-NEXT: Machine code sinking +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Peephole Optimizations ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: Live Range Shrink ; CHECK-NEXT: X86 Fixup SetCC +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 LEA Optimize ; CHECK-NEXT: X86 Optimize Call Frame ; CHECK-NEXT: X86 Avoid Store Forwarding Block @@ -108,6 +117,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Eliminate PHI nodes for register allocation +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Slot index numbering ; CHECK-NEXT: Live Interval Analysis @@ -121,7 +131,6 @@ ; CHECK-NEXT: Live Register Matrix ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: Spill Code Placement Analysis -; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Greedy Register Allocator ; CHECK-NEXT: Virtual Register Rewriter @@ -139,6 +148,7 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass @@ -151,13 +161,16 @@ ; CHECK-NEXT: MachinePostDominator Tree Construction ; CHECK-NEXT: Branch Probability Basic Block Placement ; CHECK-NEXT: ReachingDefAnalysis +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Execution Dependency Fix ; CHECK-NEXT: BreakFalseDeps ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Byte/Word Instruction Fixup +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible diff --git a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll --- a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll +++ b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll @@ -29,6 +29,13 @@ ret i32 %a } +define i32 @test_pgso(i32 %a) nounwind !prof !14 { +; CHECK: test_pgso +; CHECK: movl +; CHECK-NEXT: ret + ret i32 %a +} + define i32 @test_add(i32 %a, i32 %b) nounwind { ; CHECK: test_add ; CHECK: addl @@ -101,3 +108,19 @@ ret void } +!llvm.module.flags = !{!0} +!0 = 
!{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll --- a/llvm/test/CodeGen/X86/avx-cvt.ll +++ b/llvm/test/CodeGen/X86/avx-cvt.ll @@ -190,6 +190,16 @@ ret float %res } +define float @floor_f32_load_pgso(float* %aptr) !prof !14 { +; CHECK-LABEL: floor_f32_load_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = load float, float* %aptr + %res = call float @llvm.floor.f32(float %a) + ret float %res +} + define double @nearbyint_f64_load(double* %aptr) optsize { ; CHECK-LABEL: nearbyint_f64_load: ; CHECK: # %bb.0: @@ -200,3 +210,29 @@ ret double %res } +define double @nearbyint_f64_load_pgso(double* %aptr) !prof !14 { +; CHECK-LABEL: nearbyint_f64_load_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = load double, double* %aptr + %res = call double @llvm.nearbyint.f64(double %a) + ret double %res +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1970,6 +1970,47 @@ ret <32 x i16> %ret } +define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 { +; KNL-LABEL: test_build_vec_v32i1_pgso: +; KNL: ## %bb.0: +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_build_vec_v32i1_pgso: +; SKX: ## %bb.0: +; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_build_vec_v32i1_pgso: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_build_vec_v32i1_pgso: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_build_vec_v32i1_pgso: +; X86: ## %bb.0: +; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl + %ret = 
select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer + ret <32 x i16> %ret +} + define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## %bb.0: @@ -2013,12 +2054,12 @@ ; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB43_2 +; KNL-NEXT: je LBB44_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovapd %zmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB43_2: ## %L2 +; KNL-NEXT: LBB44_2: ## %L2 ; KNL-NEXT: vmovapd %zmm0, 8(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -2029,12 +2070,12 @@ ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; SKX-NEXT: ktestb %k0, %k1 -; SKX-NEXT: je LBB43_2 +; SKX-NEXT: je LBB44_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB43_2: ## %L2 +; SKX-NEXT: LBB44_2: ## %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -2046,12 +2087,12 @@ ; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB43_2 +; AVX512BW-NEXT: je LBB44_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovapd %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB43_2: ## %L2 +; AVX512BW-NEXT: LBB44_2: ## %L2 ; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2062,12 +2103,12 @@ ; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: ktestb %k0, %k1 -; AVX512DQ-NEXT: je LBB43_2 +; AVX512DQ-NEXT: je LBB44_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB43_2: ## %L2 +; AVX512DQ-NEXT: LBB44_2: ## %L2 ; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -2079,12 +2120,12 @@ ; X86-NEXT: vmovupd 8(%eax), %zmm1 {%k1} {z} ; X86-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; X86-NEXT: ktestb %k0, %k1 -; X86-NEXT: je LBB43_2 +; X86-NEXT: je LBB44_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovapd %zmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB43_2: ## %L2 +; X86-NEXT: LBB44_2: ## %L2 ; X86-NEXT: vmovapd %zmm0, 8(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -2131,13 +2172,13 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB44_2 +; KNL-NEXT: je LBB45_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovaps %zmm0, (%rdi) ; KNL-NEXT: vmovaps %zmm1, 64(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB44_2: ## %L2 +; KNL-NEXT: LBB45_2: ## %L2 ; KNL-NEXT: vmovaps %zmm0, 4(%rdi) ; KNL-NEXT: vmovaps %zmm1, 68(%rdi) ; KNL-NEXT: vzeroupper @@ -2154,13 +2195,13 @@ ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k1 ; SKX-NEXT: kortestd %k1, %k0 -; SKX-NEXT: je LBB44_2 +; SKX-NEXT: je LBB45_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB44_2: ## %L2 +; SKX-NEXT: LBB45_2: ## %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) ; SKX-NEXT: vzeroupper @@ -2177,13 +2218,13 @@ ; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1 ; AVX512BW-NEXT: kortestd %k1, %k0 -; AVX512BW-NEXT: je LBB44_2 +; AVX512BW-NEXT: je LBB45_2 ; AVX512BW-NEXT: ## %bb.1: ## 
%L1 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB44_2: ## %L2 +; AVX512BW-NEXT: LBB45_2: ## %L2 ; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512BW-NEXT: vzeroupper @@ -2203,13 +2244,13 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB44_2 +; AVX512DQ-NEXT: je LBB45_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB44_2: ## %L2 +; AVX512DQ-NEXT: LBB45_2: ## %L2 ; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512DQ-NEXT: vzeroupper @@ -2227,13 +2268,13 @@ ; X86-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; X86-NEXT: kunpckwd %k1, %k2, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB44_2 +; X86-NEXT: je LBB45_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovaps %zmm0, (%eax) ; X86-NEXT: vmovaps %zmm1, 64(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB44_2: ## %L2 +; X86-NEXT: LBB45_2: ## %L2 ; X86-NEXT: vmovaps %zmm0, 4(%eax) ; X86-NEXT: vmovaps %zmm1, 68(%eax) ; X86-NEXT: vzeroupper @@ -4188,12 +4229,12 @@ ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testw %ax, %ax -; KNL-NEXT: jle LBB65_1 +; KNL-NEXT: jle LBB66_1 ; KNL-NEXT: ## %bb.2: ## %bb.2 ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB65_1: ## %bb.1 +; KNL-NEXT: LBB66_1: ## %bb.1 ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4207,12 +4248,12 @@ ; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testw %ax, %ax -; SKX-NEXT: jle LBB65_1 +; SKX-NEXT: jle LBB66_1 ; SKX-NEXT: ## %bb.2: ## %bb.2 ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB65_1: ## %bb.1 +; SKX-NEXT: LBB66_1: ## %bb.1 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4226,12 +4267,12 @@ ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testw %ax, %ax -; AVX512BW-NEXT: jle LBB65_1 +; AVX512BW-NEXT: jle LBB66_1 ; AVX512BW-NEXT: ## %bb.2: ## %bb.2 ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB65_1: ## %bb.1 +; AVX512BW-NEXT: LBB66_1: ## %bb.1 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4245,12 +4286,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: testw %ax, %ax -; AVX512DQ-NEXT: jle LBB65_1 +; AVX512DQ-NEXT: jle LBB66_1 ; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB65_1: ## %bb.1 +; AVX512DQ-NEXT: LBB66_1: ## %bb.1 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4264,12 +4305,12 @@ ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: testw %ax, %ax -; X86-NEXT: jle LBB65_1 +; X86-NEXT: jle LBB66_1 ; X86-NEXT: ## %bb.2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB65_1: ## %bb.1 +; X86-NEXT: LBB66_1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4297,11 +4338,11 @@ ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kortestw %k0, %k0 -; 
CHECK-NEXT: jb LBB66_2 +; CHECK-NEXT: jb LBB67_2 ; CHECK-NEXT: ## %bb.1: ## %bb.1 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _foo -; CHECK-NEXT: LBB66_2: ## %bb.2 +; CHECK-NEXT: LBB67_2: ## %bb.2 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -4313,11 +4354,11 @@ ; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kortestw %k0, %k0 -; X86-NEXT: jb LBB66_2 +; X86-NEXT: jb LBB67_2 ; X86-NEXT: ## %bb.1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo -; X86-NEXT: LBB66_2: ## %bb.2 +; X86-NEXT: LBB67_2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -4505,12 +4546,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB72_1 +; KNL-NEXT: je LBB73_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB72_1: ## %bar +; KNL-NEXT: LBB73_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4527,12 +4568,12 @@ ; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB72_1 +; SKX-NEXT: je LBB73_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB72_1: ## %bar +; SKX-NEXT: LBB73_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4555,12 +4596,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB72_1 +; AVX512BW-NEXT: je LBB73_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB72_1: ## %bar +; AVX512BW-NEXT: LBB73_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4581,12 +4622,12 @@ ; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: korb %k3, %k2, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB72_1 +; AVX512DQ-NEXT: je LBB73_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB72_1: ## %bar +; AVX512DQ-NEXT: LBB73_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4603,12 +4644,12 @@ ; X86-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB72_1 +; X86-NEXT: je LBB73_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB72_1: ## %bar +; X86-NEXT: LBB73_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4646,12 +4687,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB73_1 +; KNL-NEXT: je LBB74_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB73_1: ## %bar +; KNL-NEXT: LBB74_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4668,12 +4709,12 @@ ; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB73_1 +; SKX-NEXT: je LBB74_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB73_1: ## %bar +; SKX-NEXT: LBB74_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4692,12 +4733,12 @@ ; AVX512BW-NEXT: kandw %k1, 
%k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB73_1 +; AVX512BW-NEXT: je LBB74_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB73_1: ## %bar +; AVX512BW-NEXT: LBB74_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4714,12 +4755,12 @@ ; AVX512DQ-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korb %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB73_1 +; AVX512DQ-NEXT: je LBB74_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB73_1: ## %bar +; AVX512DQ-NEXT: LBB74_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4736,12 +4777,12 @@ ; X86-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB73_1 +; X86-NEXT: je LBB74_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB73_1: ## %bar +; X86-NEXT: LBB74_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4778,12 +4819,12 @@ ; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: je LBB74_1 +; KNL-NEXT: je LBB75_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB74_1: ## %bar +; KNL-NEXT: LBB75_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4800,12 +4841,12 @@ ; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; SKX-NEXT: korw %k2, %k1, %k1 ; SKX-NEXT: ktestw %k1, %k0 -; SKX-NEXT: je LBB74_1 +; SKX-NEXT: je LBB75_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB74_1: ## %bar +; SKX-NEXT: LBB75_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4823,12 +4864,12 @@ ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kortestw %k0, %k0 -; AVX512BW-NEXT: je LBB74_1 +; AVX512BW-NEXT: je LBB75_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB74_1: ## %bar +; AVX512BW-NEXT: LBB75_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4845,12 +4886,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestw %k1, %k0 -; AVX512DQ-NEXT: je LBB74_1 +; AVX512DQ-NEXT: je LBB75_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB74_1: ## %bar +; AVX512DQ-NEXT: LBB75_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4867,12 +4908,12 @@ ; X86-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; X86-NEXT: korw %k2, %k1, %k1 ; X86-NEXT: ktestw %k1, %k0 -; X86-NEXT: je LBB74_1 +; X86-NEXT: je LBB75_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB74_1: ## %bar +; X86-NEXT: LBB75_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4928,12 +4969,12 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB75_1 +; KNL-NEXT: je LBB76_1 ; KNL-NEXT: ## %bb.2: ## %exit ; 
KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB75_1: ## %bar +; KNL-NEXT: LBB76_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4950,12 +4991,12 @@ ; SKX-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: ktestd %k1, %k0 -; SKX-NEXT: je LBB75_1 +; SKX-NEXT: je LBB76_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB75_1: ## %bar +; SKX-NEXT: LBB76_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4972,12 +5013,12 @@ ; AVX512BW-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: kord %k2, %k1, %k1 ; AVX512BW-NEXT: ktestd %k1, %k0 -; AVX512BW-NEXT: je LBB75_1 +; AVX512BW-NEXT: je LBB76_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB75_1: ## %bar +; AVX512BW-NEXT: LBB76_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -5014,12 +5055,12 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB75_1 +; AVX512DQ-NEXT: je LBB76_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB75_1: ## %bar +; AVX512DQ-NEXT: LBB76_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -5036,12 +5077,12 @@ ; X86-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; X86-NEXT: kord %k2, %k1, %k1 ; X86-NEXT: ktestd %k1, %k0 -; X86-NEXT: je LBB75_1 +; X86-NEXT: je LBB76_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB75_1: ## %bar +; X86-NEXT: LBB76_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -5121,12 +5162,12 @@ ; KNL-NEXT: orl %eax, %edx ; KNL-NEXT: shlq $32, %rdx ; KNL-NEXT: orq %rcx, %rdx -; KNL-NEXT: je LBB76_1 +; KNL-NEXT: je LBB77_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB76_1: ## %bar +; KNL-NEXT: LBB77_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -5143,12 +5184,12 @@ ; SKX-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; SKX-NEXT: korq %k2, %k1, %k1 ; SKX-NEXT: ktestq %k1, %k0 -; SKX-NEXT: je LBB76_1 +; SKX-NEXT: je LBB77_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB76_1: ## %bar +; SKX-NEXT: LBB77_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -5165,12 +5206,12 @@ ; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: korq %k2, %k1, %k1 ; AVX512BW-NEXT: ktestq %k1, %k0 -; AVX512BW-NEXT: je LBB76_1 +; AVX512BW-NEXT: je LBB77_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB76_1: ## %bar +; AVX512BW-NEXT: LBB77_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -5231,12 +5272,12 @@ ; AVX512DQ-NEXT: orl %eax, %edx ; AVX512DQ-NEXT: shlq $32, %rdx ; AVX512DQ-NEXT: orq %rcx, %rdx -; AVX512DQ-NEXT: je LBB76_1 +; AVX512DQ-NEXT: je LBB77_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB76_1: ## %bar +; AVX512DQ-NEXT: LBB77_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: 
popq %rax @@ -5255,12 +5296,12 @@ ; X86-NEXT: kandq %k1, %k0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB76_1 +; X86-NEXT: je LBB77_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB76_1: ## %bar +; X86-NEXT: LBB77_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -5360,3 +5401,20 @@ %maskv = insertelement <64 x i1> , i1 %a_i, i32 0 ret <64 x i1> %maskv } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll --- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll @@ -130,6 +130,24 @@ ret i64 %div } +define i64 @div64_pgso(i64 %a, i64 %b) !prof !15 { +; CHECK-LABEL: div64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: retq +; +; HUGEWS-LABEL: div64_pgso: +; HUGEWS: # %bb.0: +; HUGEWS-NEXT: movq %rdi, %rax +; HUGEWS-NEXT: cqto +; HUGEWS-NEXT: idivq %rsi +; HUGEWS-NEXT: retq + %div = sdiv i64 %a, %b + ret i64 %div +} + define i64 @div64_hugews(i64 %a, i64 %b) { ; ATOM-LABEL: div64_hugews: ; ATOM: # %bb.0: @@ -137,12 +155,12 @@ ; ATOM-NEXT: movq %rdi, %rax ; ATOM-NEXT: orq %rsi, %rcx ; ATOM-NEXT: shrq $32, %rcx -; ATOM-NEXT: je .LBB3_1 +; ATOM-NEXT: je .LBB4_1 ; ATOM-NEXT: # %bb.2: ; ATOM-NEXT: cqto ; ATOM-NEXT: idivq %rsi ; ATOM-NEXT: retq -; ATOM-NEXT: .LBB3_1: +; ATOM-NEXT: .LBB4_1: ; ATOM-NEXT: # kill: def $eax killed $eax killed $rax ; ATOM-NEXT: xorl %edx, %edx ; ATOM-NEXT: divl %esi @@ -155,12 +173,12 @@ ; SLM-NEXT: movq %rdi, %rax ; SLM-NEXT: orq %rsi, %rcx ; SLM-NEXT: shrq $32, %rcx -; SLM-NEXT: je .LBB3_1 +; SLM-NEXT: je .LBB4_1 ; SLM-NEXT: # %bb.2: ; SLM-NEXT: cqto ; SLM-NEXT: idivq %rsi ; SLM-NEXT: retq -; SLM-NEXT: .LBB3_1: +; SLM-NEXT: .LBB4_1: ; SLM-NEXT: xorl %edx, %edx ; SLM-NEXT: # kill: def $eax killed $eax killed $rax ; SLM-NEXT: divl %esi @@ -173,12 +191,12 @@ ; SKL-NEXT: movq %rdi, %rcx ; SKL-NEXT: orq %rsi, %rcx ; SKL-NEXT: shrq $32, %rcx -; SKL-NEXT: je .LBB3_1 +; SKL-NEXT: je .LBB4_1 ; SKL-NEXT: # %bb.2: ; SKL-NEXT: cqto ; SKL-NEXT: idivq %rsi ; SKL-NEXT: retq -; SKL-NEXT: .LBB3_1: +; SKL-NEXT: .LBB4_1: ; SKL-NEXT: # kill: def $eax killed $eax killed $rax ; SKL-NEXT: xorl %edx, %edx ; SKL-NEXT: divl %esi @@ -213,6 +231,24 @@ ret i32 %div } +define i32 @div32_pgso(i32 %a, i32 %b) !prof !15 { +; CHECK-LABEL: div32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: retq +; +; HUGEWS-LABEL: div32_pgso: +; HUGEWS: # %bb.0: +; HUGEWS-NEXT: movl %edi, %eax +; HUGEWS-NEXT: cltd +; HUGEWS-NEXT: idivl %esi +; HUGEWS-NEXT: retq + %div = sdiv i32 %a, %b + ret i32 %div +} + define i32 @div32_minsize(i32 %a, i32 %b) minsize { ; CHECK-LABEL: div32_minsize: ; CHECK: # %bb.0: @@ -246,3 +282,4 @@ !12 = !{i32 10000, i64 1000, i32 1} !13 = !{i32 
999000, i64 1000, i32 3} !14 = !{i32 999999, i64 5, i32 3} +!15 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll --- a/llvm/test/CodeGen/X86/cmov-into-branch.ll +++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll @@ -88,7 +88,7 @@ ; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 ret i32 %sel } @@ -104,7 +104,7 @@ ; CHECK-NEXT: .LBB6_2: # %select.end ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 ret i32 %sel } @@ -124,7 +124,7 @@ ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17 ret i32 %sel } @@ -137,12 +137,51 @@ ; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !18 ret i32 %sel } -!0 = !{!"branch_weights", i32 1, i32 99} -!1 = !{!"branch_weights", i32 1, i32 100} -!2 = !{!"branch_weights", i32 100, i32 1} -!3 = !{!"branch_weights", i32 0, i32 0} +define i32 @weighted_select_optsize(i32 %a, i32 %b) optsize { +; CHECK-LABEL: weighted_select_optsize: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} + +define i32 @weighted_select_pgso(i32 %a, i32 %b) !prof !14 { +; CHECK-LABEL: weighted_select_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} +!15 = !{!"branch_weights", i32 1, i32 99} +!16 = !{!"branch_weights", i32 1, i32 100} +!17 = !{!"branch_weights", i32 100, i32 1} +!18 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32 +; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64 +; RUN: llc < %s -mtriple=x86_64-win32 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=WIN64 + +declare void @foo() +declare void @bar() + +define void @f(i32 %x, i32 %y) !prof !14 { +; CHECK32-LABEL: f: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax # 
encoding: [0x8b,0x44,0x24,0x04] +; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x08] +; CHECK32-NEXT: jne bar # TAILCALL +; CHECK32-NEXT: # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: f: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; CHECK64-NEXT: jne bar # TAILCALL +; CHECK64-NEXT: # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: f: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] +; WIN64-NEXT: jne bar # TAILCALL +; WIN64-NEXT: # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +entry: + %p = icmp eq i32 %x, %y + br i1 %p, label %bb1, label %bb2 +bb1: + tail call void @foo() + ret void +bb2: + tail call void @bar() + ret void + +; Check that the asm doesn't just look good, but uses the correct encoding. +} + +define void @f_non_leaf(i32 %x, i32 %y) !prof !14 { +; CHECK32-LABEL: f_non_leaf: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: pushl %ebx # encoding: [0x53] +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: .cfi_offset %ebx, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; CHECK32-NEXT: #APP +; CHECK32-NEXT: #NO_APP +; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c] +; CHECK32-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: popl %ebx # encoding: [0x5b] +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB1_2: # %bb2 +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: popl %ebx # encoding: [0x5b] +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: jmp bar # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: f_non_leaf: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: pushq %rbx # encoding: [0x53] +; CHECK64-NEXT: .cfi_def_cfa_offset 16 +; CHECK64-NEXT: .cfi_offset %rbx, -16 +; CHECK64-NEXT: #APP +; CHECK64-NEXT: #NO_APP +; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; CHECK64-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: popq %rbx # encoding: [0x5b] +; CHECK64-NEXT: .cfi_def_cfa_offset 8 +; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB1_2: # %bb2 +; CHECK64-NEXT: .cfi_def_cfa_offset 16 +; CHECK64-NEXT: popq %rbx # encoding: [0x5b] +; CHECK64-NEXT: .cfi_def_cfa_offset 8 +; CHECK64-NEXT: jmp bar # TAILCALL +; CHECK64-NEXT: # encoding: 
[0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: f_non_leaf: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: pushq %rbx # encoding: [0x53] +; WIN64-NEXT: .seh_pushreg %rbx +; WIN64-NEXT: .seh_endprologue +; WIN64-NEXT: #APP +; WIN64-NEXT: #NO_APP +; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] +; WIN64-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: popq %rbx # encoding: [0x5b] +; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB1_2: # %bb2 +; WIN64-NEXT: nop # encoding: [0x90] +; WIN64-NEXT: popq %rbx # encoding: [0x5b] +; WIN64-NEXT: jmp bar # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: .seh_handlerdata +; WIN64-NEXT: .text +; WIN64-NEXT: .seh_endproc +entry: + ; Force %ebx to be spilled on the stack, turning this into + ; not a "leaf" function for Win64. + tail call void asm sideeffect "", "~{ebx}"() + + %p = icmp eq i32 %x, %y + br i1 %p, label %bb1, label %bb2 +bb1: + tail call void @foo() + ret void +bb2: + tail call void @bar() + ret void + +} + +declare x86_thiscallcc zeroext i1 @baz(i8*, i32) +define x86_thiscallcc zeroext i1 @BlockPlacementTest(i8* %this, i32 %x) !prof !14 { +; CHECK32-LABEL: BlockPlacementTest: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] +; CHECK32-NEXT: testb $42, %dl # encoding: [0xf6,0xc2,0x2a] +; CHECK32-NEXT: je .LBB2_3 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %land.rhs +; CHECK32-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; CHECK32-NEXT: testb $44, %dl # encoding: [0xf6,0xc2,0x2c] +; CHECK32-NEXT: je baz # TAILCALL +; CHECK32-NEXT: # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB2_2: # %land.end +; CHECK32-NEXT: # kill: def $al killed $al killed $eax +; CHECK32-NEXT: retl $4 # encoding: [0xc2,0x04,0x00] +; CHECK32-NEXT: .LBB2_3: +; CHECK32-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK32-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: BlockPlacementTest: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: testb $42, %sil # encoding: [0x40,0xf6,0xc6,0x2a] +; CHECK64-NEXT: je .LBB2_3 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %land.rhs +; CHECK64-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; CHECK64-NEXT: testb $44, %sil # encoding: [0x40,0xf6,0xc6,0x2c] +; CHECK64-NEXT: je baz # TAILCALL +; CHECK64-NEXT: # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB2_2: # %land.end +; CHECK64-NEXT: # kill: def $al killed $al killed $eax +; CHECK64-NEXT: retq # encoding: [0xc3] +; CHECK64-NEXT: .LBB2_3: +; CHECK64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK64-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: BlockPlacementTest: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: testb $42, %dl # encoding: [0xf6,0xc2,0x2a] +; WIN64-NEXT: je .LBB2_3 # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - 
offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %land.rhs +; WIN64-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; WIN64-NEXT: testb $44, %dl # encoding: [0xf6,0xc2,0x2c] +; WIN64-NEXT: je baz # TAILCALL +; WIN64-NEXT: # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB2_2: # %land.end +; WIN64-NEXT: # kill: def $al killed $al killed $eax +; WIN64-NEXT: retq # encoding: [0xc3] +; WIN64-NEXT: .LBB2_3: +; WIN64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; WIN64-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +entry: + %and = and i32 %x, 42 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %land.end, label %land.rhs + +land.rhs: + %and6 = and i32 %x, 44 + %tobool7 = icmp eq i32 %and6, 0 + br i1 %tobool7, label %lor.rhs, label %land.end + +lor.rhs: + %call = tail call x86_thiscallcc zeroext i1 @baz(i8* %this, i32 %x) #2 + br label %land.end + +land.end: + %0 = phi i1 [ false, %entry ], [ true, %land.rhs ], [ %call, %lor.rhs ] + ret i1 %0 + +; Make sure machine block placement isn't confused by the conditional tail call, +; but sees that it can fall through to the next block. +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll --- a/llvm/test/CodeGen/X86/fixup-lea.ll +++ b/llvm/test/CodeGen/X86/fixup-lea.ll @@ -108,17 +108,96 @@ ret void } +define void @foo_pgso(i32 inreg %dns) !prof !14 { +; SLOW-LABEL: foo_pgso: +; SLOW: # %bb.0: # %entry +; SLOW-NEXT: xorl %ecx, %ecx +; SLOW-NEXT: decl %ecx +; SLOW-NEXT: .LBB4_1: # %for.body +; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 +; SLOW-NEXT: movzwl %cx, %edx +; SLOW-NEXT: decl %ecx +; SLOW-NEXT: cmpl %eax, %edx +; SLOW-NEXT: jl .LBB4_1 +; SLOW-NEXT: # %bb.2: # %for.end +; SLOW-NEXT: retl +; +; FAST-LABEL: foo_pgso: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xorl %ecx, %ecx +; FAST-NEXT: decl %ecx +; FAST-NEXT: .LBB4_1: # %for.body +; FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-NEXT: movzwl %cx, %edx +; FAST-NEXT: addl $-1, %ecx +; FAST-NEXT: cmpl %eax, %edx +; FAST-NEXT: jl .LBB4_1 +; FAST-NEXT: # %bb.2: # %for.end +; FAST-NEXT: retl +entry: + br label %for.body + +for.body: + %i.05 = phi i16 [ %dec, %for.body ], [ 0, %entry ] + %dec = add i16 %i.05, -1 + %conv = zext i16 %dec to i32 + %cmp = icmp slt i32 %conv, %dns + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +define void @bar_pgso(i32 inreg %dns) !prof !14 { +; SLOW-LABEL: bar_pgso: +; SLOW: # %bb.0: # %entry +; SLOW-NEXT: xorl %ecx, %ecx +; SLOW-NEXT: incl %ecx +; SLOW-NEXT: .LBB5_1: # %for.body +; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 +; SLOW-NEXT: movzwl %cx, %edx +; SLOW-NEXT: incl %ecx +; SLOW-NEXT: cmpl %eax, %edx +; SLOW-NEXT: jl .LBB5_1 +; SLOW-NEXT: # %bb.2: # %for.end +; SLOW-NEXT: retl +; +; FAST-LABEL: bar_pgso: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xorl %ecx, %ecx +; FAST-NEXT: incl %ecx +; FAST-NEXT: 
.LBB5_1: # %for.body +; FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-NEXT: movzwl %cx, %edx +; FAST-NEXT: addl $1, %ecx +; FAST-NEXT: cmpl %eax, %edx +; FAST-NEXT: jl .LBB5_1 +; FAST-NEXT: # %bb.2: # %for.end +; FAST-NEXT: retl +entry: + br label %for.body + +for.body: + %i.05 = phi i16 [ %inc, %for.body ], [ 0, %entry ] + %inc = add i16 %i.05, 1 + %conv = zext i16 %inc to i32 + %cmp = icmp slt i32 %conv, %dns + br i1 %cmp, label %for.body, label %for.end +for.end: + ret void +} + define void @foo_nosize(i32 inreg %dns) { ; SLOW-LABEL: foo_nosize: ; SLOW: # %bb.0: # %entry ; SLOW-NEXT: movw $-1, %cx ; SLOW-NEXT: .p2align 4, 0x90 -; SLOW-NEXT: .LBB4_1: # %for.body +; SLOW-NEXT: .LBB6_1: # %for.body ; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 ; SLOW-NEXT: movzwl %cx, %edx ; SLOW-NEXT: decl %ecx ; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB4_1 +; SLOW-NEXT: jl .LBB6_1 ; SLOW-NEXT: # %bb.2: # %for.end ; SLOW-NEXT: retl ; @@ -126,12 +205,12 @@ ; FAST: # %bb.0: # %entry ; FAST-NEXT: movw $-1, %cx ; FAST-NEXT: .p2align 4, 0x90 -; FAST-NEXT: .LBB4_1: # %for.body +; FAST-NEXT: .LBB6_1: # %for.body ; FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-NEXT: movzwl %cx, %edx ; FAST-NEXT: addl $-1, %ecx ; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB4_1 +; FAST-NEXT: jl .LBB6_1 ; FAST-NEXT: # %bb.2: # %for.end ; FAST-NEXT: retl entry: @@ -153,12 +232,12 @@ ; SLOW: # %bb.0: # %entry ; SLOW-NEXT: movw $1, %cx ; SLOW-NEXT: .p2align 4, 0x90 -; SLOW-NEXT: .LBB5_1: # %for.body +; SLOW-NEXT: .LBB7_1: # %for.body ; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 ; SLOW-NEXT: movzwl %cx, %edx ; SLOW-NEXT: incl %ecx ; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB5_1 +; SLOW-NEXT: jl .LBB7_1 ; SLOW-NEXT: # %bb.2: # %for.end ; SLOW-NEXT: retl ; @@ -166,12 +245,12 @@ ; FAST: # %bb.0: # %entry ; FAST-NEXT: movw $1, %cx ; FAST-NEXT: .p2align 4, 0x90 -; FAST-NEXT: .LBB5_1: # %for.body +; FAST-NEXT: .LBB7_1: # %for.body ; FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-NEXT: movzwl %cx, %edx ; FAST-NEXT: addl $1, %ecx ; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB5_1 +; FAST-NEXT: jl .LBB7_1 ; FAST-NEXT: # %bb.2: # %for.end ; FAST-NEXT: retl entry: @@ -186,3 +265,20 @@ for.end: ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll --- a/llvm/test/CodeGen/X86/fold-load-unops.ll +++ b/llvm/test/CodeGen/X86/fold-load-unops.ll @@ -113,6 +113,38 @@ ret <4 x float> %res } +define float @rcpss_pgso(float* %a) !prof !14 { +; SSE-LABEL: rcpss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rcpss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rcpss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @rcpss_full_pgso(<4 x float>* %a) 
!prof !14 { +; SSE-LABEL: rcpss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rcpss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rcpss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld) + ret <4 x float> %res +} + define float @rsqrtss_size(float* %a) optsize { ; SSE-LABEL: rsqrtss_size: ; SSE: # %bb.0: @@ -145,6 +177,38 @@ ret <4 x float> %res } +define float @rsqrtss_pgso(float* %a) !prof !14 { +; SSE-LABEL: rsqrtss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rsqrtss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @rsqrtss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: rsqrtss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rsqrtss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + define float @sqrtss_size(float* %a) optsize{ ; SSE-LABEL: sqrtss_size: ; SSE: # %bb.0: @@ -196,6 +260,57 @@ ret <4 x float> %res } +define float @sqrtss_pgso(float* %a) !prof !14 { +; SSE-LABEL: sqrtss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: sqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @sqrtss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: sqrtss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + +define <4 x float> @sqrtss_full_pgso_volatile(<4 x float>* %a) !prof !14 { +; SSE-LABEL: sqrtss_full_pgso_volatile: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_full_pgso_volatile: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + define double @sqrtsd_size(double* %a) optsize { ; SSE-LABEL: sqrtsd_size: ; SSE: # %bb.0: @@ -247,7 +362,75 @@ ret <2 x double> %res } +define double @sqrtsd_pgso(double* %a) !prof !14 { +; SSE-LABEL: sqrtsd_pgso: +; SSE: # %bb.0: +; SSE-NEXT: sqrtsd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load double, double* %a + %ins = insertelement <2 x double> undef, double %ld, i32 0 + %res = tail call <2 x double> 
@llvm.x86.sse2.sqrt.sd(<2 x double> %ins) + %ext = extractelement <2 x double> %res, i32 0 + ret double %ext +} + +define <2 x double> @sqrtsd_full_pgso(<2 x double>* %a) !prof !14 { +; SSE-LABEL: sqrtsd_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x double>, <2 x double>* %a + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) + ret <2 x double> %res +} + +define <2 x double> @sqrtsd_full_pgso_volatile(<2 x double>* %a) !prof !14 { +; SSE-LABEL: sqrtsd_full_pgso_volatile: +; SSE: # %bb.0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_full_pgso_volatile: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <2 x double>, <2 x double>* %a + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) + ret <2 x double> %res +} + declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -196,6 +196,26 @@ ret i32 %tmp } +define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 { +; X86-LABEL: var_shift_i32_pgso: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: var_shift_i32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shldl %cl, %esi, %eax +; X64-NEXT: retq + %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) + ret i32 %tmp +} + define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: @@ -216,36 +236,36 @@ ; X86-FAST-NEXT: shll %cl, %edi ; X86-FAST-NEXT: shldl %cl, %eax, %ebp ; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB4_2 +; X86-FAST-NEXT: je .LBB5_2 ; X86-FAST-NEXT: # %bb.1: ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB4_2: +; X86-FAST-NEXT: .LBB5_2: ; X86-FAST-NEXT: movb $64, %cl ; X86-FAST-NEXT: subb %bl, %cl ; X86-FAST-NEXT: movl %edx, %esi ; X86-FAST-NEXT: shrl %cl, %esi ; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: jne .LBB4_3 +; X86-FAST-NEXT: jne .LBB5_3 ; X86-FAST-NEXT: # %bb.4: ; X86-FAST-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: jne .LBB4_6 -; X86-FAST-NEXT: jmp .LBB4_7 -; X86-FAST-NEXT: .LBB4_3: +; X86-FAST-NEXT: jne .LBB5_6 +; X86-FAST-NEXT: jmp .LBB5_7 +; X86-FAST-NEXT: .LBB5_3: ; X86-FAST-NEXT: movl %esi, %ecx ; X86-FAST-NEXT: xorl %esi, %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB4_7 -; X86-FAST-NEXT: .LBB4_6: +; X86-FAST-NEXT: je .LBB5_7 +; X86-FAST-NEXT: .LBB5_6: ; X86-FAST-NEXT: orl %esi, %ebp ; X86-FAST-NEXT: orl %ecx, %edi ; X86-FAST-NEXT: movl %edi, %eax ; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: .LBB4_7: +; X86-FAST-NEXT: .LBB5_7: ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi @@ -279,11 +299,11 @@ ; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: je .LBB4_2 +; X86-SLOW-NEXT: je .LBB5_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %eax, %ebp ; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB4_2: +; X86-SLOW-NEXT: .LBB5_2: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SLOW-NEXT: movl %ebp, %eax ; X86-SLOW-NEXT: movl %ebx, %ecx @@ -294,41 +314,41 @@ ; X86-SLOW-NEXT: negb %cl ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb %ch, %ch -; X86-SLOW-NEXT: je .LBB4_4 +; X86-SLOW-NEXT: je .LBB5_4 ; X86-SLOW-NEXT: # %bb.3: ; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: movl %eax, %ebp -; X86-SLOW-NEXT: .LBB4_4: +; X86-SLOW-NEXT: .LBB5_4: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB4_6 +; X86-SLOW-NEXT: je .LBB5_6 ; X86-SLOW-NEXT: # %bb.5: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: .LBB4_6: +; X86-SLOW-NEXT: .LBB5_6: ; X86-SLOW-NEXT: movb %dh, %cl ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: jne .LBB4_7 +; X86-SLOW-NEXT: jne .LBB5_7 ; X86-SLOW-NEXT: # %bb.8: ; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB4_10 -; X86-SLOW-NEXT: jmp .LBB4_11 -; X86-SLOW-NEXT: .LBB4_7: +; X86-SLOW-NEXT: jne .LBB5_10 +; X86-SLOW-NEXT: jmp .LBB5_11 +; X86-SLOW-NEXT: .LBB5_7: ; X86-SLOW-NEXT: movl %esi, %ecx ; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB4_11 -; X86-SLOW-NEXT: .LBB4_10: +; X86-SLOW-NEXT: je .LBB5_11 +; X86-SLOW-NEXT: .LBB5_10: ; X86-SLOW-NEXT: orl %esi, %ebp ; X86-SLOW-NEXT: orl %ecx, %edi ; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: .LBB4_11: +; X86-SLOW-NEXT: .LBB5_11: ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi @@ -503,3 +523,20 @@ %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 7) ret i64 %tmp } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = 
!{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -195,6 +195,26 @@ ret i32 %tmp } +define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 { +; X86-LABEL: var_shift_i32_pgso: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: var_shift_i32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrdl %cl, %edi, %eax +; X64-NEXT: retq + %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) + ret i32 %tmp +} + define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: @@ -216,30 +236,30 @@ ; X86-FAST-NEXT: shll %cl, %edi ; X86-FAST-NEXT: shldl %cl, %eax, %esi ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: je .LBB4_2 +; X86-FAST-NEXT: je .LBB5_2 ; X86-FAST-NEXT: # %bb.1: ; X86-FAST-NEXT: movl %edi, %esi ; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB4_2: +; X86-FAST-NEXT: .LBB5_2: ; X86-FAST-NEXT: movl %edx, %ebp ; X86-FAST-NEXT: movl %ebx, %ecx ; X86-FAST-NEXT: shrl %cl, %ebp ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-FAST-NEXT: shrdl %cl, %edx, %eax ; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB4_4 +; X86-FAST-NEXT: je .LBB5_4 ; X86-FAST-NEXT: # %bb.3: ; X86-FAST-NEXT: movl %ebp, %eax ; X86-FAST-NEXT: xorl %ebp, %ebp -; X86-FAST-NEXT: .LBB4_4: +; X86-FAST-NEXT: .LBB5_4: ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB4_6 +; X86-FAST-NEXT: je .LBB5_6 ; X86-FAST-NEXT: # %bb.5: ; X86-FAST-NEXT: orl %ebp, %esi ; X86-FAST-NEXT: orl %eax, %edi ; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: .LBB4_6: +; X86-FAST-NEXT: .LBB5_6: ; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi @@ -274,11 +294,11 @@ ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb %ch, %ch ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB4_2 +; X86-SLOW-NEXT: je .LBB5_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %edi, %edx ; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB4_2: +; X86-SLOW-NEXT: .LBB5_2: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl %ecx, %edx ; X86-SLOW-NEXT: movl %ebx, %ecx @@ -291,41 +311,41 @@ ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: testb %ah, %ah ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB4_4 +; X86-SLOW-NEXT: je .LBB5_4 ; X86-SLOW-NEXT: # %bb.3: ; X86-SLOW-NEXT: orl %edx, %edi ; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: .LBB4_4: +; X86-SLOW-NEXT: .LBB5_4: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB4_6 +; X86-SLOW-NEXT: je .LBB5_6 ; X86-SLOW-NEXT: # %bb.5: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: .LBB4_6: +; X86-SLOW-NEXT: .LBB5_6: ; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: jne .LBB4_7 +; X86-SLOW-NEXT: jne .LBB5_7 ; X86-SLOW-NEXT: # %bb.8: ; 
X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB4_10 -; X86-SLOW-NEXT: jmp .LBB4_11 -; X86-SLOW-NEXT: .LBB4_7: +; X86-SLOW-NEXT: jne .LBB5_10 +; X86-SLOW-NEXT: jmp .LBB5_11 +; X86-SLOW-NEXT: .LBB5_7: ; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB4_11 -; X86-SLOW-NEXT: .LBB4_10: +; X86-SLOW-NEXT: je .LBB5_11 +; X86-SLOW-NEXT: .LBB5_10: ; X86-SLOW-NEXT: orl %ebp, %esi ; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: .LBB4_11: +; X86-SLOW-NEXT: .LBB5_11: ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi @@ -499,3 +519,20 @@ %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 7) ret i64 %tmp } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1983,6 +1983,80 @@ ret float %x230 } +define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_4_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_4_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> + %x227 = fadd <4 x float> %x225, %x226 + %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> + %x229 = fadd <4 x float> %x227, %x228 + %x230 = extractelement <4 x float> %x229, i32 0 + ret float %x230 +} + +define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_8_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> + %x227 = fadd <8 x float> %x225, %x226 + %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> + %x229 = fadd <8 x float> %x227, %x228 + %x230 = extractelement <8 x float> %x229, i32 0 + ret float %x230 +} + +define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_16_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; 
SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> + %x227 = fadd <16 x float> %x225, %x226 + %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> + %x229 = fadd <16 x float> %x227, %x228 + %x230 = extractelement <16 x float> %x229, i32 0 + ret float %x230 +} + define float @partial_reduction_fadd_v8f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32: ; SSE3-SLOW: # %bb.0: @@ -2115,3 +2189,20 @@ %r = extractelement <16 x float> %x0123, i32 0 ret float %r } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll --- a/llvm/test/CodeGen/X86/immediate_merging.ll +++ b/llvm/test/CodeGen/X86/immediate_merging.ll @@ -73,6 +73,68 @@ ret i32 0 } +; Test PGSO to make sure immediates with multiple users don't get pulled in to +; instructions. +define i32 @foo_pgso() !prof !14 { +; X86-LABEL: foo_pgso: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $1234, %eax # imm = 0x4D2 +; X86-NEXT: movl %eax, a +; X86-NEXT: movl %eax, b +; X86-NEXT: movl $12, %eax +; X86-NEXT: movl %eax, c +; X86-NEXT: cmpl %eax, e +; X86-NEXT: jne .LBB1_2 +; X86-NEXT: # %bb.1: # %if.then +; X86-NEXT: movl $1, x +; X86-NEXT: .LBB1_2: # %if.end +; X86-NEXT: movl $1234, f # imm = 0x4D2 +; X86-NEXT: movl $555, %eax # imm = 0x22B +; X86-NEXT: movl %eax, h +; X86-NEXT: addl %eax, i +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; +; X64-LABEL: foo_pgso: +; X64: # %bb.0: # %entry +; X64-NEXT: movl $1234, %eax # imm = 0x4D2 +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: movl $12, %eax +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: cmpl %eax, {{.*}}(%rip) +; X64-NEXT: jne .LBB1_2 +; X64-NEXT: # %bb.1: # %if.then +; X64-NEXT: movl $1, {{.*}}(%rip) +; X64-NEXT: .LBB1_2: # %if.end +; X64-NEXT: movl $1234, {{.*}}(%rip) # imm = 0x4D2 +; X64-NEXT: movl $555, %eax # imm = 0x22B +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: addl %eax, {{.*}}(%rip) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +entry: + store i32 1234, i32* @a + store i32 1234, i32* @b + store i32 12, i32* @c + %0 = load i32, i32* @e + %cmp = icmp eq i32 %0, 12 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* @x + br label %if.end + +; New block.. Make sure 1234 isn't live across basic blocks from before. +if.end: ; preds = %if.then, %entry + store i32 1234, i32* @f + store i32 555, i32* @h + %1 = load i32, i32* @i + %add1 = add nsw i32 %1, 555 + store i32 %add1, i32* @i + ret i32 0 +} + ; Test -O2 to make sure that all immediates get pulled in to their users. 
define i32 @foo2() { ; X86-LABEL: foo2: @@ -124,3 +186,47 @@ call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false) ret void } + +; memset gets lowered in DAG. Constant merging should hoist all the +; immediates used to store to the individual memory locations. Make +; sure we don't directly store the immediates. +define void @foomemset_pgso() !prof !14 { +; X86-LABEL: foomemset_pgso: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $555819297, %eax # imm = 0x21212121 +; X86-NEXT: movl %eax, AA+20 +; X86-NEXT: movl %eax, AA+16 +; X86-NEXT: movl %eax, AA+12 +; X86-NEXT: movl %eax, AA+8 +; X86-NEXT: movl %eax, AA+4 +; X86-NEXT: movl %eax, AA +; X86-NEXT: retl +; +; X64-LABEL: foomemset_pgso: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121 +; X64-NEXT: movq %rax, AA+{{.*}}(%rip) +; X64-NEXT: movq %rax, AA+{{.*}}(%rip) +; X64-NEXT: movq %rax, {{.*}}(%rip) +; X64-NEXT: retq +entry: + call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/immediate_merging64.ll b/llvm/test/CodeGen/X86/immediate_merging64.ll --- a/llvm/test/CodeGen/X86/immediate_merging64.ll +++ b/llvm/test/CodeGen/X86/immediate_merging64.ll @@ -19,6 +19,19 @@ ret i1 %cmp } +define i1 @imm_multiple_users_pgso(i64 %a, i64* %b) !prof !14 { +; CHECK-LABEL: imm_multiple_users_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + store i64 -1, i64* %b, align 8 + %cmp = icmp eq i64 %a, -1 + ret i1 %cmp +} + declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) ; Inlined memsets requiring multiple same-sized stores should be lowered using @@ -34,3 +47,31 @@ tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false) ret void } + +define void @memset_zero_pgso(i8* noalias nocapture %D) !prof !14 { +; CHECK-LABEL: memset_zero_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movq %rax, 7(%rdi) +; CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: retq + tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/loop-blocks.ll b/llvm/test/CodeGen/X86/loop-blocks.ll --- a/llvm/test/CodeGen/X86/loop-blocks.ll +++ 
b/llvm/test/CodeGen/X86/loop-blocks.ll @@ -269,6 +269,35 @@ attributes #0 = { minsize norecurse nounwind optsize readnone uwtable } +; CHECK-LABEL: slightly_more_involved_2_pgso: +; CHECK-NOT: jmp .LBB6_1 +; CHECK: .LBB6_1: +; CHECK-NEXT: callq body + +define void @slightly_more_involved_2_pgso() norecurse nounwind readnone uwtable !prof !14 { +entry: + br label %loop + +loop: + call void @body() + %t0 = call i32 @get() + %t1 = icmp slt i32 %t0, 2 + br i1 %t1, label %block_a, label %bb + +bb: + %t2 = call i32 @get() + %t3 = icmp slt i32 %t2, 99 + br i1 %t3, label %exit, label %loop + +block_a: + call void @bar99() + br label %loop + +exit: + call void @exit() + ret void +} + declare void @bar99() nounwind declare void @bar100() nounwind declare void @bar101() nounwind @@ -281,3 +310,20 @@ declare void @block_a_true_func() nounwind declare void @block_a_false_func() nounwind declare void @block_a_merge_func() nounwind + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll --- a/llvm/test/CodeGen/X86/materialize.ll +++ b/llvm/test/CodeGen/X86/materialize.ll @@ -30,6 +30,21 @@ ; CHECK64-NEXT: retq } +define i32 @one32_pgso() !prof !14 { +entry: + ret i32 1 + +; CHECK32-LABEL: one32_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl + +; FIXME: Figure out the best approach in 64-bit mode. +; CHECK64-LABEL: one32_pgso: +; CHECK64: movl $1, %eax +; CHECK64-NEXT: retq +} + define i32 @one32_minsize() minsize { entry: ret i32 1 @@ -107,6 +122,16 @@ ; CHECK32-NEXT: retl } +define i32 @minus_one32_pgso() !prof !14 { +entry: + ret i32 -1 + +; CHECK32-LABEL: minus_one32_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + define i32 @minus_one32_minsize() minsize { entry: ret i32 -1 @@ -140,6 +165,28 @@ ; CHECK32-NEXT: retl } +define i16 @one16_pgso() !prof !14 { +entry: + ret i16 1 + +; CHECK32-LABEL: one16_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: # kill +; CHECK32-NEXT: retl +} + +define i16 @minus_one16_pgso() !prof !14 { +entry: + ret i16 -1 + +; CHECK32-LABEL: minus_one16_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: # kill +; CHECK32-NEXT: retl +} + define i32 @minus_five32() minsize { entry: ret i32 -5 @@ -213,4 +260,72 @@ ; CHECK32: retl } +define i32 @rematerialize_minus_one_pgso() !prof !14 { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; -1 should be re-materialized here instead of getting spilled above. 
+ ret i32 -1 + +; CHECK32-LABEL: rematerialize_minus_one_pgso +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NOT: %eax +; CHECK32: retl +} + +define i32 @rematerialize_minus_one_eflags_pgso(i32 %x) !prof !14 { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; Define eflags. + %a = icmp ne i32 %x, 123 + %b = zext i1 %a to i32 + ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. + ; It must therefore not use the xor-dec lowering. + %c = select i1 %a, i32 %b, i32 -1 + ret i32 %c + +; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: cmpl +; CHECK32: setne +; CHECK32-NOT: xorl +; CHECK32: movl $-1 +; CHECK32: cmov +; CHECK32: retl +} + declare x86_thiscallcc void @f(i32) + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -0,0 +1,1065 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: 
retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3: +; X86: # %bb.0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # %bb.0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # %bb.2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 
2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: xorw (%rsi), %ax +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorb 2(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orw %ax, %cx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbl $0, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # %bb.0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5: +; X86: # %bb.0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # %bb.2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # %bb.0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # %bb.2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: setae %al +; 
X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: xorl (%rsi), %eax +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorb 4(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: seta %al +; X64-NEXT: sbbl $0, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # %bb.0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* 
%Y) nounwind !prof !14 { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl 8(%rsi), %ecx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; 
X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length24: +; X64: # %bb.0: +; X64-NEXT: movl $24, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm 
= 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length24_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pmovmskb %xmm2, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovq 16(%rsi), %xmm2 +; X64-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length24_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq 16(%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # %bb.0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; 
X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length32_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vptest %xmm0, %xmm0 +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length32_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, 
%xmm0 +; X64-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vptest %xmm0, %xmm0 +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # %bb.0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length64_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl $64, %edx +; X64-AVX1-NEXT: callq memcmp +; X64-AVX1-NEXT: testl %eax, %eax +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: popq %rcx +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length64_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl $.L.str, %esi +; X64-AVX1-NEXT: movl $64, %edx +; X64-AVX1-NEXT: callq memcmp +; X64-AVX1-NEXT: testl %eax, %eax +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: popq %rcx +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq_const: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: 
vpxor {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: bcmp_length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: bcmp_length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -139,6 +139,36 @@ ret void } +define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 { +; LINUX-LABEL: test3_pgso: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: movl $64, %edx +; LINUX-NEXT: jmp memcpy # TAILCALL +; +; DARWIN-LABEL: test3_pgso: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: movq 56(%rsi), %rax +; DARWIN-NEXT: movq %rax, 56(%rdi) +; DARWIN-NEXT: movq 48(%rsi), %rax +; DARWIN-NEXT: movq %rax, 48(%rdi) +; DARWIN-NEXT: movq 40(%rsi), %rax +; DARWIN-NEXT: movq %rax, 40(%rdi) +; DARWIN-NEXT: movq 32(%rsi), %rax +; DARWIN-NEXT: movq %rax, 32(%rdi) +; DARWIN-NEXT: movq 24(%rsi), %rax +; DARWIN-NEXT: movq %rax, 24(%rdi) +; DARWIN-NEXT: movq 16(%rsi), %rax +; DARWIN-NEXT: movq %rax, 16(%rdi) +; DARWIN-NEXT: movq (%rsi), %rax +; DARWIN-NEXT: movq 8(%rsi), %rcx +; DARWIN-NEXT: movq %rcx, 8(%rdi) +; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: retq +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) + ret void +} + define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone { ; DARWIN-LABEL: test3_minsize: ; DARWIN: ## %bb.0: @@ -506,3 +536,20 @@ tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false) ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = 
!{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll --- a/llvm/test/CodeGen/X86/powi.ll +++ b/llvm/test/CodeGen/X86/powi.ll @@ -86,6 +86,39 @@ ret double %ret } +define double @pow_wrapper_pgso(double %a) !prof !14 { +; X86-X87-LABEL: pow_wrapper_pgso: +; X86-X87: # %bb.0: +; X86-X87-NEXT: subl $12, %esp +; X86-X87-NEXT: .cfi_def_cfa_offset 16 +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstpl (%esp) +; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-X87-NEXT: calll __powidf2 +; X86-X87-NEXT: addl $12, %esp +; X86-X87-NEXT: .cfi_def_cfa_offset 4 +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper_pgso: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powidf2 +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper_pgso: +; X64: # %bb.0: +; X64-NEXT: movl $15, %edi +; X64-NEXT: jmp __powidf2 # TAILCALL + %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] + ret double %ret +} + define double @pow_wrapper_minsize(double %a) minsize { ; X86-X87-LABEL: pow_wrapper_minsize: ; X86-X87: # %bb.0: @@ -124,3 +157,19 @@ declare double @llvm.powi.f64(double, i32) nounwind readonly +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll --- a/llvm/test/CodeGen/X86/rounding-ops.ll +++ b/llvm/test/CodeGen/X86/rounding-ops.ll @@ -252,3 +252,60 @@ %call = tail call double @trunc(double %x) nounwind readnone ret double %call } + +define float @test11_pgso(float* %xptr) nounwind !prof !14 { +; CHECK-SSE-LABEL: test11_pgso: +; CHECK-SSE: ## %bb.0: +; CHECK-SSE-NEXT: roundss $11, (%rdi), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test11_pgso: +; CHECK-AVX: ## %bb.0: +; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512-LABEL: test11_pgso: +; CHECK-AVX512: ## %bb.0: +; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: retq + %x = load float, float* %xptr + %call = tail call float @truncf(float %x) nounwind readnone + ret float %call +} + +define double @test12_pgso(double* %xptr) nounwind !prof !14 { +; CHECK-SSE-LABEL: test12_pgso: +; CHECK-SSE: ## %bb.0: +; CHECK-SSE-NEXT: roundsd $11, (%rdi), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test12_pgso: +; CHECK-AVX: ## %bb.0: +; CHECK-AVX-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512-LABEL: test12_pgso: +; CHECK-AVX512: ## %bb.0: +; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: retq + %x = load double, double* 
%xptr + %call = tail call double @trunc(double %x) nounwind readnone + ret double %call +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s + +declare void @bar() + +define void @test1(i32* nocapture %X) nounwind !prof !14 { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, (%rdi) +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %tmp1 = load i32, i32* %X, align 4 + %and = and i32 %tmp1, 255 + %cmp = icmp eq i32 %and, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test3(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; PR16083 +define i1 @test4(i64 %a, i32 %b) { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: je .LBB3_1 +; CHECK-NEXT: # %bb.2: # %lor.end +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB3_1: # %lor.rhs +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %tobool = icmp ne i32 %b, 0 + br i1 %tobool, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %entry + %and = and i64 0, %a + %tobool1 = icmp ne i64 %and, 0 + br label %lor.end + +lor.end: ; preds = %lor.rhs, %entry + %p = phi i1 [ true, %entry ], [ %tobool1, %lor.rhs ] + ret i1 %p +} + +@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4 + +; PR16551 +define void @test5(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl x+{{.*}}(%rip), %eax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movzwl x+{{.*}}(%rip), %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: cmpl $1, %ecx +; CHECK-NEXT: jne bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %bf.load = load 
i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4 + %bf.lshr = lshr i56 %bf.load, 32 + %bf.cast = trunc i56 %bf.lshr to i32 + %cmp = icmp ne i32 %bf.cast, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2_1(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: cmpl $256, %eax # imm = 0x100 +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 256 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_1(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_47(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_47: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_127(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $127, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg1(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg2(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-2, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -2 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg127(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-127, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg128(i8 %x) nounwind !prof 
!14 { +; CHECK-LABEL: test_sext_i8_icmp_neg128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-128, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -128 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_255(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_255: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/slow-incdec.ll b/llvm/test/CodeGen/X86/slow-incdec.ll --- a/llvm/test/CodeGen/X86/slow-incdec.ll +++ b/llvm/test/CodeGen/X86/slow-incdec.ll @@ -54,6 +54,26 @@ ret i32 %r } +define i32 @inc_pgso(i32 %x) !prof !14 { +; CHECK-LABEL: inc_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: incl %eax +; CHECK-NEXT: retl + %r = add i32 %x, 1 + ret i32 %r +} + +define i32 @dec_pgso(i32 %x) !prof !14 { +; CHECK-LABEL: dec_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: decl %eax +; CHECK-NEXT: retl + %r = add i32 %x, -1 + ret i32 %r +} + declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) declare void @other(i32* ) nounwind; @@ -62,20 +82,20 @@ ; INCDEC: # %bb.0: # %entry ; INCDEC-NEXT: movl {{[0-9]+}}(%esp), %eax ; INCDEC-NEXT: incl (%eax) -; INCDEC-NEXT: jne .LBB4_1 +; INCDEC-NEXT: jne .LBB6_1 ; INCDEC-NEXT: # %bb.2: # %if.end4 ; INCDEC-NEXT: jmp other # TAILCALL -; INCDEC-NEXT: .LBB4_1: # %return +; INCDEC-NEXT: .LBB6_1: # %return ; INCDEC-NEXT: retl ; ; ADD-LABEL: cond_ae_to_cond_ne: ; ADD: # %bb.0: # %entry ; ADD-NEXT: movl {{[0-9]+}}(%esp), %eax ; ADD-NEXT: addl $1, (%eax) -; ADD-NEXT: jne .LBB4_1 +; ADD-NEXT: jne .LBB6_1 ; ADD-NEXT: # %bb.2: # %if.end4 ; ADD-NEXT: jmp other # TAILCALL -; ADD-NEXT: .LBB4_1: # %return +; ADD-NEXT: .LBB6_1: # %return ; ADD-NEXT: retl entry: %t0 = load i32, i32* %p, align 8 @@ -109,10 +129,10 @@ ; INCDEC-NEXT: incb a ; INCDEC-NEXT: sete d ; INCDEC-NEXT: testb %al, %al -; INCDEC-NEXT: jne .LBB5_2 +; INCDEC-NEXT: jne .LBB7_2 ; INCDEC-NEXT: # %bb.1: # %then ; INCDEC-NEXT: jmp external_a # TAILCALL -; INCDEC-NEXT: .LBB5_2: # %else +; INCDEC-NEXT: .LBB7_2: # %else ; INCDEC-NEXT: jmp external_b # TAILCALL ; ; ADD-LABEL: test_tail_call: @@ -123,10 +143,10 @@ ; ADD-NEXT: addb $1, a ; ADD-NEXT: sete d ; ADD-NEXT: testb %al, %al -; ADD-NEXT: jne .LBB5_2 +; ADD-NEXT: jne .LBB7_2 ; ADD-NEXT: # %bb.1: # %then ; ADD-NEXT: jmp external_a # TAILCALL -; ADD-NEXT: .LBB5_2: # %else +; ADD-NEXT: .LBB7_2: # %else ; ADD-NEXT: jmp external_b # TAILCALL entry: %val = load i32, 
i32* %ptr @@ -152,3 +172,19 @@ ret void } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -17,6 +17,17 @@ ret <2 x double> %add } +define <2 x double> @splat_v2f64_pgso(<2 x double> %x) !prof !14 { +; CHECK-LABEL: splat_v2f64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; CHECK-NEXT: # xmm1 = mem[0,0] +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %add = fadd <2 x double> %x, + ret <2 x double> %add +} + define <4 x double> @splat_v4f64(<4 x double> %x) #1 { ; CHECK-LABEL: splat_v4f64: ; CHECK: # %bb.0: @@ -27,6 +38,16 @@ ret <4 x double> %add } +define <4 x double> @splat_v4f64_pgso(<4 x double> %x) !prof !14 { +; CHECK-LABEL: splat_v4f64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %add = fadd <4 x double> %x, + ret <4 x double> %add +} + define <4 x float> @splat_v4f32(<4 x float> %x) #0 { ; CHECK-LABEL: splat_v4f32: ; CHECK: # %bb.0: @@ -37,6 +58,16 @@ ret <4 x float> %add } +define <4 x float> @splat_v4f32_pgso(<4 x float> %x) !prof !14 { +; CHECK-LABEL: splat_v4f32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %add = fadd <4 x float> %x, + ret <4 x float> %add +} + define <8 x float> @splat_v8f32(<8 x float> %x) #1 { ; CHECK-LABEL: splat_v8f32: ; CHECK: # %bb.0: @@ -47,6 +78,16 @@ ret <8 x float> %add } +define <8 x float> @splat_v8f32_pgso(<8 x float> %x) !prof !14 { +; CHECK-LABEL: splat_v8f32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %add = fadd <8 x float> %x, + ret <8 x float> %add +} + ; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value. ; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq. define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 { @@ -66,6 +107,23 @@ ret <2 x i64> %add } +define <2 x i64> @splat_v2i64_pgso(<2 x i64> %x) !prof !14 { +; AVX-LABEL: splat_v2i64_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] +; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v2i64_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <2 x i64> %x, + ret <2 x i64> %add +} + ; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors, ; and then we fake it: use vmovddup to splat 64-bit value. 
define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { @@ -88,6 +146,26 @@ ret <4 x i64> %add } +define <4 x i64> @splat_v4i64_pgso(<4 x i64> %x) !prof !14 { +; AVX-LABEL: splat_v4i64_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2,2] +; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v4i64_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <4 x i64> %x, + ret <4 x i64> %add +} + ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 { ; AVX-LABEL: splat_v4i32: @@ -105,6 +183,22 @@ ret <4 x i32> %add } +define <4 x i32> @splat_v4i32_pgso(<4 x i32> %x) !prof !14 { +; AVX-LABEL: splat_v4i32_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v4i32_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <4 x i32> %x, + ret <4 x i32> %add +} + ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { ; AVX-LABEL: splat_v8i32: @@ -125,6 +219,25 @@ ret <8 x i32> %add } +define <8 x i32> @splat_v8i32_pgso(<8 x i32> %x) !prof !14 { +; AVX-LABEL: splat_v8i32_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] +; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v8i32_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <8 x i32> %x, + ret <8 x i32> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 { ; AVX-LABEL: splat_v8i16: @@ -141,6 +254,21 @@ ret <8 x i16> %add } +define <8 x i16> @splat_v8i16_pgso(<8 x i16> %x) !prof !14 { +; AVX-LABEL: splat_v8i16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v8i16_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <8 x i16> %x, + ret <8 x i16> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? 
define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { ; AVX-LABEL: splat_v16i16: @@ -161,6 +289,25 @@ ret <16 x i16> %add } +define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 { +; AVX-LABEL: splat_v16i16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v16i16_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <16 x i16> %x, + ret <16 x i16> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 { ; AVX-LABEL: splat_v16i8: @@ -177,6 +324,21 @@ ret <16 x i8> %add } +define <16 x i8> @splat_v16i8_pgso(<16 x i8> %x) !prof !14 { +; AVX-LABEL: splat_v16i8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v16i8_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <16 x i8> %x, + ret <16 x i8> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { ; AVX-LABEL: splat_v32i8: @@ -197,6 +359,25 @@ ret <32 x i8> %add } +define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 { +; AVX-LABEL: splat_v32i8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v32i8_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <32 x i8> %x, + ret <32 x i8> %add +} + ; PR23259: Verify that ISel doesn't crash with a 'fatal error in backend' ; due to a missing AVX pattern to select a v2i64 X86ISD::BROADCAST of a ; loadi64 with multiple uses. 
@@ -238,3 +419,20 @@ attributes #0 = { optsize } attributes #1 = { minsize } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -822,6 +822,55 @@ ret <2 x double> %res } +define <2 x double> @test_x86_sse2_cvtss2sd_load_pgso(<2 x double> %a0, <4 x float>* %p1) !prof !14 { +; X86-SSE-LABEL: test_x86_sse2_cvtss2sd_load_pgso: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: cvtss2sd (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x5a,0x08] +; X86-SSE-NEXT: movsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x10,0xc1] +; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_pgso: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX1-NEXT: vcvtss2sd (%eax), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x08] +; X86-AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1] +; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_pgso: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512-NEXT: vcvtss2sd (%eax), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x08] +; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse2_cvtss2sd_load_pgso: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: cvtss2sd (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x5a,0x0f] +; X64-SSE-NEXT: movsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x10,0xc1] +; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_pgso: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vcvtss2sd (%rdi), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x0f] +; X64-AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1] +; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_pgso: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vcvtss2sd (%rdi), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x0f] +; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] + %a1 = load <4 x float>, <4 x float>* %p1 + %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) 
{ ; SSE-LABEL: test_x86_sse2_cvtdq2ps: @@ -1042,3 +1091,20 @@ ret <8 x i16> %res } declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -431,6 +431,52 @@ ret <4 x float> %tmp1 } +define <4 x float> @insertps_or_blendps_pgso(<4 x float> %t1, float %t2) nounwind !prof !14 { +; X86-SSE-LABEL: insertps_or_blendps_pgso: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04] +; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1] +; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: insertps_or_blendps_pgso: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] +; X86-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] +; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: insertps_or_blendps_pgso: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] +; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: insertps_or_blendps_pgso: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1] +; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: insertps_or_blendps_pgso: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] +; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: insertps_or_blendps_pgso: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] + %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 + ret <4 x float> %tmp1 +} + ; An insert into the low 32-bits of a vector from the low 32-bits of another vector ; is always just a blendps because blendps is never more expensive than insertps. 
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind { @@ -2179,3 +2225,20 @@ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 ret <4 x float> %vecinit1 } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll --- a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll +++ b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll @@ -19,6 +19,23 @@ } +define void @zero_pgso(i32* %p) !prof !14 { +; CHECK32-LABEL: zero_pgso: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl $0, (%eax) +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: zero_pgso: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: movl $0, (%rdi) +; CHECK64-NEXT: retq +entry: + store i32 0, i32* %p + ret void + +} + define void @minus_one_optsize(i32* %p) optsize { ; CHECK32-LABEL: minus_one_optsize: ; CHECK32: # %bb.0: # %entry @@ -36,6 +53,22 @@ } +define void @minus_one_pgso(i32* %p) !prof !14 { +; CHECK32-LABEL: minus_one_pgso: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl $-1, (%eax) +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: minus_one_pgso: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: movl $-1, (%rdi) +; CHECK64-NEXT: retq +entry: + store i32 -1, i32* %p + ret void + +} define void @zero_64(i64* %p) minsize { ; CHECK32-LABEL: zero_64: @@ -244,3 +277,20 @@ store volatile i16 -1, i16* %p ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/switch-density.ll b/llvm/test/CodeGen/X86/switch-density.ll --- a/llvm/test/CodeGen/X86/switch-density.ll +++ b/llvm/test/CodeGen/X86/switch-density.ll @@ -79,3 +79,72 @@ ; CHECK: ja ; CHECK: jmpq *.LJTI } + +define void @dense_optsize(i32 %x) optsize { +entry: + switch i32 %x, label %return [ + i32 12, label %bb0 + i32 4, label %bb1 + i32 16, label %bb1 + i32 20, label %bb2 + i32 8, label %bb3 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +bb2: tail call void @g(i32 1) br label %return +bb3: tail call void @g(i32 2) br label %return +return: ret void + +; Lowered as branches. 
+; CHECK-LABEL: dense_optsize +; CHECK: cmpl $11 +; CHECK: cmpl $20 +; CHECK: cmpl $16 +; CHECK: cmpl $12 +; CHECK: cmpl $4 +; CHECK: cmpl $8 +; CHECK: retq +} + +define void @dense_pgso(i32 %x) !prof !14 { +entry: + switch i32 %x, label %return [ + i32 12, label %bb0 + i32 4, label %bb1 + i32 16, label %bb1 + i32 20, label %bb2 + i32 8, label %bb3 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +bb2: tail call void @g(i32 1) br label %return +bb3: tail call void @g(i32 2) br label %return +return: ret void + +; Lowered as branches. +; CHECK-LABEL: dense_pgso +; CHECK: cmpl $11 +; CHECK: cmpl $20 +; CHECK: cmpl $16 +; CHECK: cmpl $12 +; CHECK: cmpl $4 +; CHECK: cmpl $8 +; CHECK: retq +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll --- a/llvm/test/CodeGen/X86/tail-opts.ll +++ b/llvm/test/CodeGen/X86/tail-opts.ll @@ -480,6 +480,47 @@ ret void } +define void @one_pgso(i32 %v) nounwind !prof !14 { +; CHECK-LABEL: one_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: je .LBB6_3 +; CHECK-NEXT: # %bb.1: # %bby +; CHECK-NEXT: cmpl $16, %edi +; CHECK-NEXT: je .LBB6_4 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: jmp tail_call_me # TAILCALL +; CHECK-NEXT: .LBB6_3: # %bbx +; CHECK-NEXT: cmpl $128, %edi +; CHECK-NEXT: jne tail_call_me # TAILCALL +; CHECK-NEXT: .LBB6_4: # %return +; CHECK-NEXT: retq +entry: + %0 = icmp eq i32 %v, 0 + br i1 %0, label %bbx, label %bby + +bby: + switch i32 %v, label %bb7 [ + i32 16, label %return + ] + +bb7: + tail call void @tail_call_me() + ret void + +bbx: + switch i32 %v, label %bb12 [ + i32 128, label %return + ] + +bb12: + tail call void @tail_call_me() + ret void + +return: + ret void +} + ; two - Same as one, but with two instructions in the common ; tail instead of one. This is too much to be merged, given ; the optsize attribute. 
@@ -491,10 +532,51 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB6_1 +; CHECK-NEXT: je .LBB7_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB6_1: # %bb7 +; CHECK-NEXT: .LBB7_1: # %bb7 +; CHECK-NEXT: movl $0, {{.*}}(%rip) +; CHECK-NEXT: movl $1, {{.*}}(%rip) +entry: + %0 = icmp eq i32 undef, 0 + br i1 %0, label %bbx, label %bby + +bby: + switch i32 undef, label %bb7 [ + i32 16, label %return + ] + +bb7: + store volatile i32 0, i32* @XYZ + store volatile i32 1, i32* @XYZ + unreachable + +bbx: + switch i32 undef, label %bb12 [ + i32 128, label %return + ] + +bb12: + store volatile i32 0, i32* @XYZ + store volatile i32 1, i32* @XYZ + unreachable + +return: + ret void +} + +define void @two_pgso() nounwind !prof !14 { +; CHECK-LABEL: two_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB8_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB8_1: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: movl $1, {{.*}}(%rip) entry: @@ -534,10 +616,10 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB7_1 +; CHECK-NEXT: je .LBB9_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB7_1: # %bb7 +; CHECK-NEXT: .LBB9_1: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: movl $1, {{.*}}(%rip) entry: @@ -575,20 +657,20 @@ ; CHECK-LABEL: two_nosize: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: je .LBB8_3 +; CHECK-NEXT: je .LBB10_3 ; CHECK-NEXT: # %bb.1: # %bby ; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: je .LBB8_4 +; CHECK-NEXT: je .LBB10_4 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: jmp tail_call_me # TAILCALL -; CHECK-NEXT: .LBB8_3: # %bbx +; CHECK-NEXT: .LBB10_3: # %bbx ; CHECK-NEXT: cmpl $-1, %edx -; CHECK-NEXT: je .LBB8_4 +; CHECK-NEXT: je .LBB10_4 ; CHECK-NEXT: # %bb.5: # %bb12 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: jmp tail_call_me # TAILCALL -; CHECK-NEXT: .LBB8_4: # %return +; CHECK-NEXT: .LBB10_4: # %return ; CHECK-NEXT: retq entry: %0 = icmp eq i32 %x, 0 @@ -628,11 +710,11 @@ ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: cmovgq %rdi, %rax ; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: jle .LBB9_2 +; CHECK-NEXT: jle .LBB11_2 ; CHECK-NEXT: # %bb.1: # %bb.nph ; CHECK-NEXT: imulq %rdi, %rsi ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: .LBB9_2: # %for.end +; CHECK-NEXT: .LBB11_2: # %for.end ; CHECK-NEXT: retq entry: %cmp = icmp slt i64 %parami, 1 ; [#uses=1] @@ -661,24 +743,24 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.1: # %cont1 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.2: # %cont2 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.3: # %cont3 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.4: # %cont4 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB10_5: # %abort1 +; CHECK-NEXT: .LBB12_5: # %abort1 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq abort entry: @@ -721,27 +803,27 @@ ; 
CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_5 +; CHECK-NEXT: je .LBB13_5 ; CHECK-NEXT: # %bb.1: # %cont1 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: je .LBB13_6 ; CHECK-NEXT: # %bb.2: # %cont2 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_5 +; CHECK-NEXT: je .LBB13_5 ; CHECK-NEXT: # %bb.3: # %cont3 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: je .LBB13_6 ; CHECK-NEXT: # %bb.4: # %cont4 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB11_5: # %abort1 +; CHECK-NEXT: .LBB13_5: # %abort1 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq abort -; CHECK-NEXT: .LBB11_6: # %abort2 +; CHECK-NEXT: .LBB13_6: # %abort2 ; CHECK-NEXT: callq alt_abort entry: %c1 = call i1 @qux() @@ -770,3 +852,20 @@ cont4: ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll --- a/llvm/test/CodeGen/X86/test-vs-bittest.ll +++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll @@ -49,6 +49,30 @@ ret void } +define void @test64_pgso(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jb .LBB2_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB2_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 2048 + %s = icmp eq i64 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + ; This test is identical to test64 above with only the destination of the br ; reversed. This somehow causes the two functions to get slightly different ; initial IR. One has an extra invert of the setcc. 
This previous caused one @@ -60,10 +84,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: je .LBB3_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB2_2: # %no +; CHECK-NEXT: .LBB3_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -84,10 +108,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB3_2 +; CHECK-NEXT: jae .LBB4_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB3_2: # %no +; CHECK-NEXT: .LBB4_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 2048 + %s = icmp eq i64 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_2(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB5_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB5_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -108,10 +156,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jb .LBB4_2 +; CHECK-NEXT: jb .LBB6_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB4_2: # %no +; CHECK-NEXT: .LBB6_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -132,10 +180,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jb .LBB5_2 +; CHECK-NEXT: jb .LBB7_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB5_2: # %no +; CHECK-NEXT: .LBB7_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 4294967296 + %s = icmp eq i64 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_3(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btq $32, %rdi +; CHECK-NEXT: jb .LBB8_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB8_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -156,10 +228,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jae .LBB6_2 +; CHECK-NEXT: jae .LBB9_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB6_2: # %no +; CHECK-NEXT: .LBB9_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -180,10 +252,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jae .LBB7_2 +; CHECK-NEXT: jae .LBB10_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB7_2: # %no +; CHECK-NEXT: .LBB10_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 4294967296 + %s = icmp eq i64 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_4(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_4: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; 
CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btq $32, %rdi +; CHECK-NEXT: jae .LBB11_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB11_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -204,10 +300,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: jne .LBB8_2 +; CHECK-NEXT: jne .LBB12_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB8_2: # %no +; CHECK-NEXT: .LBB12_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -228,10 +324,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jb .LBB9_2 +; CHECK-NEXT: jb .LBB13_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB9_2: # %no +; CHECK-NEXT: .LBB13_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -252,10 +348,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB10_2 +; CHECK-NEXT: je .LBB14_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB10_2: # %no +; CHECK-NEXT: .LBB14_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -276,10 +372,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB11_2 +; CHECK-NEXT: jae .LBB15_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB11_2: # %no +; CHECK-NEXT: .LBB15_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i32 %x, 2048 + %s = icmp eq i32 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test32_pgso_2(i32 inreg %x) !prof !14 { +; CHECK-LABEL: test32_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB16_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB16_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -300,10 +420,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: jne .LBB12_2 +; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB12_2: # %no +; CHECK-NEXT: .LBB17_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -324,10 +444,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jb .LBB13_2 +; CHECK-NEXT: jb .LBB18_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB13_2: # %no +; CHECK-NEXT: .LBB18_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i16 %x, 2048 + %s = icmp eq i16 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test16_pgso(i16 inreg %x) !prof !14 { +; CHECK-LABEL: test16_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jb .LBB19_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB19_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ 
-348,10 +492,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB14_2 +; CHECK-NEXT: je .LBB20_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB14_2: # %no +; CHECK-NEXT: .LBB20_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -372,10 +516,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB15_2 +; CHECK-NEXT: jae .LBB21_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB15_2: # %no +; CHECK-NEXT: .LBB21_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i16 %x, 2048 + %s = icmp eq i16 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test16_pgso_2(i16 inreg %x) !prof !14 { +; CHECK-LABEL: test16_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB22_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB22_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -512,3 +680,20 @@ } declare void @bar() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2002,6 +2002,44 @@ ret <8 x i32> %b } +define <4 x double> @shuffle_v4f64_0zzz_pgso(<4 x double> %a) !prof !14 { +; ALL-LABEL: shuffle_v4f64_0zzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> + ret <4 x double> %b +} + +define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 { +; ALL-LABEL: shuffle_v4i64_0zzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> + ret <4 x i64> %b +} + +define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 { +; ALL-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; ALL-NEXT: retq + %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> + ret <8 x float> %b +} + +define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 { +; ALL-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; ALL-NEXT: retq + %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> + ret <8 x i32> %b +} + define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) { ; ALL-LABEL: unpckh_v4i64: ; ALL: # %bb.0: @@ -2022,3 +2060,19 @@ ret <4 x 
double> %unpckh } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll --- a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll +++ b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll @@ -240,3 +240,140 @@ %a = xor i64 %x, 9223372036854775808 ; toggle bit 63 ret i64 %a } + +define i64 @and1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $31, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 18446744071562067967 ; clear bit 31 + ret i64 %a +} + +define i64 @and2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $32, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 18446744069414584319 ; clear bit 32 + ret i64 %a +} + +define i64 @and3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $62, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 13835058055282163711 ; clear bit 62 + ret i64 %a +} + +define i64 @and4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $63, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 9223372036854775807 ; clear bit 63 + ret i64 %a +} + +define i64 @or1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $31, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 2147483648 ; set bit 31 + ret i64 %a +} + +define i64 @or2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $32, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 4294967296 ; set bit 32 + ret i64 %a +} + +define i64 @or3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $62, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 4611686018427387904 ; set bit 62 + ret i64 %a +} + +define i64 @or4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $63, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 9223372036854775808 ; set bit 63 + ret i64 %a +} + +define i64 @xor1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $31, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 2147483648 ; toggle bit 31 + ret i64 %a +} + +define i64 @xor2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $32, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 4294967296 ; toggle bit 32 + ret i64 %a +} + +define i64 @xor3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $62, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 4611686018427387904 ; toggle bit 62 + ret i64 %a +} + +define i64 @xor4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor4_pgso: +; 
CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $63, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 9223372036854775808 ; toggle bit 63 + ret i64 %a +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll @@ -50,6 +50,19 @@ ret i64 %or } +define i64 @_Z8lshift11mm_pgso(i64 %a, i64 %b) !prof !14 { +; CHECK-LABEL: _Z8lshift11mm_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shldq $11, %rsi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, 11 + %shr = lshr i64 %b, 53 + %or = or i64 %shr, %shl + ret i64 %or +} + attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; clang -O2 -c test2.cpp -emit-llvm -S @@ -78,3 +91,19 @@ attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll --- a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll +++ b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll @@ -25,6 +25,26 @@ ret void } +define void @f_pgso(i8* %p, i8* %q, i32* inalloca nocapture %unused) !prof !14 { +entry: + %g = alloca %struct.T, align 8 + %r = alloca i32, align 8 + store i32 0, i32* %r, align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %p, i8* align 8 %q, i32 24, i1 false) + br label %while.body + +while.body: ; preds = %while.body, %entry + %load = load i32, i32* %r, align 4 + %dec = add nsw i32 %load, -1 + store i32 %dec, i32* %r, align 4 + call void @g(%struct.T* %g) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body + ret void +} + ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1 @@ -46,5 +66,38 @@ ; CHECK: testb %[[NE_REG]], %[[NE_REG]] ; CHECK: jne +; CHECK-LABEL: _f_pgso: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; 
CHECK: andl $-8, %esp +; CHECK-NOT: movl %esp, %esi +; CHECK: rep;movsl +; CHECK: leal 8(%esp), %esi + +; CHECK: decl (%esp) +; CHECK: setne %[[NE_REG:.*]] +; CHECK: pushl %esi +; CHECK: calll _g +; CHECK: addl $4, %esp +; CHECK: testb %[[NE_REG]], %[[NE_REG]] +; CHECK: jne + attributes #0 = { nounwind optsize } attributes #1 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll --- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -152,6 +152,30 @@ br label %fallthrough } +; Negative test - opt for size +define void @test6_pgso(i1 %cond, i64* %base) !prof !14 { +; CHECK-LABEL: @test6 +entry: +; CHECK: %addr = getelementptr + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br i1 %cond, label %if.then, label %fallthrough + +if.then: +; CHECK-LABEL: if.then: +; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40 + %v1 = load i32, i32* %casted, align 4 + call void @foo(i32 %v1) + %cmp = icmp eq i32 %v1, 0 + br i1 %cmp, label %rare.1, label %fallthrough + +fallthrough: + ret void + +rare.1: + call void @slowpath(i32 %v1, i32* %casted) cold + br label %fallthrough +} ; Make sure sinking two copies of addressing mode into different blocks works ; when there are cold paths for each. @@ -278,3 +302,20 @@ store i1 false, i1* %G23 ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp --- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp +++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp @@ -66,7 +66,7 @@ if (!DAG) report_fatal_error("DAG?"); OptimizationRemarkEmitter ORE(F); - DAG->init(*MF, ORE, nullptr, nullptr, nullptr); + DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr); } LLVMContext Context; diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -5129,6 +5129,14 @@ SubtargetFeatureInfo::emitComputeAvailableFeatures( Target.getName(), "InstructionSelector", "computeAvailableModuleFeatures", ModuleFeatures, OS); + + if (Target.getName() == "X86") { + // TODO: Implement PGSO. 
+ OS << "static bool shouldOptForSize(const MachineFunction *MF) {\n"; + OS << " return MF->getFunction().hasOptSize();\n"; + OS << "}\n\n"; + } + SubtargetFeatureInfo::emitComputeAvailableFeatures( Target.getName(), "InstructionSelector", "computeAvailableFunctionFeatures", FunctionFeatures, OS,