diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -283,6 +283,9 @@
 void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
 extern char &AMDGPUPrintfRuntimeBindingID;
 
+void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
+extern char &AMDGPUResourceUsageAnalysisID;
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -22,6 +22,7 @@
 namespace llvm {
 
 class AMDGPUMachineFunction;
+struct AMDGPUResourceUsageAnalysis;
 class AMDGPUTargetStreamer;
 class MCCodeEmitter;
 class MCOperand;
@@ -39,36 +40,17 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
 private:
-  // Track resource usage for callee functions.
-  struct SIFunctionResourceInfo {
-    // Track the number of explicitly used VGPRs. Special registers reserved at
-    // the end are tracked separately.
-    int32_t NumVGPR = 0;
-    int32_t NumAGPR = 0;
-    int32_t NumExplicitSGPR = 0;
-    uint64_t PrivateSegmentSize = 0;
-    bool UsesVCC = false;
-    bool UsesFlatScratch = false;
-    bool HasDynamicallySizedStack = false;
-    bool HasRecursion = false;
-
-    int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
-    int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
-  };
-
   void initializeTargetID(const Module &M);
-  bool doInitialization(Module &M) override;
+
+  AMDGPUResourceUsageAnalysis *ResourceUsage;
 
   SIProgramInfo CurrentProgramInfo;
-  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
 
   std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
 
   MCCodeEmitter *DumpCodeInstEmitter = nullptr;
 
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
 
   void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
   void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
@@ -103,11 +85,6 @@
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
 
-  // To memoize max SGPR usage of non-kernel functions of the module.
-  unsigned NonKernelMaxSGPRs = 0;
-  // To memoize max VGPR usage of non-kernel functions of the module.
-  unsigned NonKernelMaxVGPRs = 0;
-
   StringRef getPassName() const override;
 
   const MCSubtargetInfo* getGlobalSTI() const;
@@ -155,6 +132,8 @@
                        const char *ExtraCode, raw_ostream &O) override;
 
 protected:
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
   std::vector<std::string> DisasmLines, HexLines;
   size_t DisasmLineMaxLen;
 };
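Note on the header change above: the printer now consumes a standalone analysis instead of computing usage itself. A minimal sketch of the legacy-pass-manager pattern this sets up (the AMDGPUResourceUsageAnalysis names are from this patch; the consumer pass around them is illustrative only):

  // Hypothetical consumer, for illustration; mirrors what AMDGPUAsmPrinter
  // does below: declare the dependency, then cache a pointer per function.
  #include "AMDGPUResourceUsageAnalysis.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
  using namespace llvm;

  namespace {
  class ExampleConsumer : public MachineFunctionPass {
  public:
    static char ID;
    ExampleConsumer() : MachineFunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<AMDGPUResourceUsageAnalysis>();  // schedule it before us
      AU.addPreserved<AMDGPUResourceUsageAnalysis>(); // we do not invalidate it
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      const auto &Info =
          getAnalysis<AMDGPUResourceUsageAnalysis>().getResourceInfo(
              &MF.getFunction());
      (void)Info;   // e.g. read Info.NumVGPR here
      return false; // read-only consumer
    }
  };
  } // end anonymous namespace

  char ExampleConsumer::ID = 0;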
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPU.h"
 #include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUResourceUsageAnalysis.h"
 #include "AMDKernelCodeT.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -39,22 +40,6 @@
 using namespace llvm;
 using namespace llvm::AMDGPU;
 
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
-static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
-    "amdgpu-assume-external-call-stack-size",
-    cl::desc("Assumed stack use of any external call (in bytes)"),
-    cl::Hidden,
-    cl::init(16384));
-
-static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
-    "amdgpu-assume-dynamic-stack-object-size",
-    cl::desc("Assumed extra stack use if there are any "
-             "variable sized objects (in bytes)"),
-    cl::Hidden,
-    cl::init(4096));
-
 // This should get the default rounding mode from the kernel. We just set the
 // default here, but this could change if the OpenCL rounding mode pragmas are
 // used.
@@ -345,8 +330,6 @@
 }
 
 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
-  CallGraphResourceInfo.clear();
-
   // Pad with s_code_end to help tools and guard against instruction prefetch
   // causing stale data in caches. Arguably this should be done by the linker,
   // which is why this isn't done for Mesa.
@@ -452,6 +435,7 @@
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
   CurrentProgramInfo = SIProgramInfo();
 
   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
@@ -473,12 +457,6 @@
 
   if (MFI->isModuleEntryFunction()) {
     getSIProgramInfo(CurrentProgramInfo, MF);
-  } else {
-    auto I = CallGraphResourceInfo.insert(
-        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
-    SIFunctionResourceInfo &Info = I.first->second;
-    assert(I.second && "should only be called once per function");
-    Info = analyzeResourceUsage(MF);
   }
 
   if (STM.isAmdPalOS()) {
@@ -515,7 +493,8 @@
 
   if (!MFI->isEntryFunction()) {
     OutStreamer->emitRawComment(" Function info:", false);
-    SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+    const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+        ResourceUsage->getResourceInfo(&MF.getFunction());
     emitCommonFunctionComments(
         Info.NumVGPR,
         STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
@@ -627,21 +606,6 @@
   return false;
 }
 
-bool AMDGPUAsmPrinter::doInitialization(Module &M) {
-  NonKernelMaxSGPRs = 0;
-  NonKernelMaxVGPRs = 0;
-  // Compute upper bound on the number of SGPRs and VGPRs
-  // for non-kernel functions.
-  for (const Function &F : M) {
-    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
-      const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
-      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, STM.getMaxNumSGPRs(F));
-      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, STM.getMaxNumVGPRs(F));
-    }
-  }
-  return AsmPrinter::doInitialization(M);
-}
-
 // TODO: Fold this into emitFunctionBodyStart.
 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
   // In the beginning all features are either 'Any' or 'NotSupported',
@@ -693,415 +657,10 @@
   return CodeSize;
 }
 
-static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
-                                  const SIInstrInfo &TII,
-                                  unsigned Reg) {
-  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
-    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
-      return true;
-  }
-
-  return false;
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
-  const GCNSubtarget &ST) const {
-  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(
-      &ST, UsesVCC, UsesFlatScratch, ST.getTargetID().isXnackOnOrAny());
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
-  const GCNSubtarget &ST) const {
-  if (ST.hasGFX90AInsts() && NumAGPR)
-    return alignTo(NumVGPR, 4) + NumAGPR;
-  return std::max(NumVGPR, NumAGPR);
-}
-
-static const Function *getCalleeFunction(const MachineOperand &Op) {
-  if (Op.isImm()) {
-    assert(Op.getImm() == 0);
-    return nullptr;
-  }
-
-  return cast<Function>(Op.getGlobal());
-}
-
-AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
-  const MachineFunction &MF) const {
-  SIFunctionResourceInfo Info;
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-
-  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
-                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
-                         MRI.isLiveIn(MFI->getPreloadedReg(
-                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
-
-  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
-  // instructions aren't used to access the scratch buffer. Inline assembly may
-  // need it though.
-  //
-  // If we only have implicit uses of flat_scr on flat instructions, it is not
-  // really needed.
-  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
-      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
-       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
-       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
-    Info.UsesFlatScratch = false;
-  }
-
-  Info.PrivateSegmentSize = FrameInfo.getStackSize();
-
-  // Assume a big number if there are any unknown sized objects.
-  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
-  if (Info.HasDynamicallySizedStack)
-    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
-
-  if (MFI->isStackRealigned())
-    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
-
-  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
-                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
-  // If there are no calls, MachineRegisterInfo can tell us the used register
-  // count easily.
-  // A tail call isn't considered a call for MachineFrameInfo's purposes.
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
-    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
-      if (MRI.isPhysRegUsed(Reg)) {
-        HighestVGPRReg = Reg;
-        break;
-      }
-    }
-
-    if (ST.hasMAIInsts()) {
-      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
-      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
-        if (MRI.isPhysRegUsed(Reg)) {
-          HighestAGPRReg = Reg;
-          break;
-        }
-      }
-      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
-        TRI.getHWRegIndex(HighestAGPRReg) + 1;
-    }
-
-    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
-    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
-      if (MRI.isPhysRegUsed(Reg)) {
-        HighestSGPRReg = Reg;
-        break;
-      }
-    }
-
-    // We found the maximum register index. They start at 0, so add one to get the
-    // number of registers.
-    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
-      TRI.getHWRegIndex(HighestVGPRReg) + 1;
-    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
-      TRI.getHWRegIndex(HighestSGPRReg) + 1;
-
-    return Info;
-  }
-
-  int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
-  uint64_t CalleeFrameSize = 0;
-
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
-
-        if (!MO.isReg())
-          continue;
-
-        Register Reg = MO.getReg();
-        switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::M0_LO16:
-        case AMDGPU::M0_HI16:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::MODE:
-          continue;
-
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-          llvm_unreachable("src_pops_exiting_wave_id should not be used");
-
-        case AMDGPU::NoRegister:
-          assert(MI.isDebugInstr() && "Instruction uses invalid noreg register");
-          continue;
-
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
-        case AMDGPU::XNACK_MASK:
-        case AMDGPU::XNACK_MASK_LO:
-        case AMDGPU::XNACK_MASK_HI:
-          llvm_unreachable("xnack_mask registers should not be used");
-
-        case AMDGPU::LDS_DIRECT:
-          llvm_unreachable("lds_direct register should not be used");
-
-        case AMDGPU::TBA:
-        case AMDGPU::TBA_LO:
-        case AMDGPU::TBA_HI:
-        case AMDGPU::TMA:
-        case AMDGPU::TMA_LO:
-        case AMDGPU::TMA_HI:
-          llvm_unreachable("trap handler registers should not be used");
-
-        case AMDGPU::SRC_VCCZ:
-          llvm_unreachable("src_vccz register should not be used");
-
-        case AMDGPU::SRC_EXECZ:
-          llvm_unreachable("src_execz register should not be used");
-
-        case AMDGPU::SRC_SCC:
-          llvm_unreachable("src_scc register should not be used");
-
-        default:
-          break;
-        }
-
-        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
-            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 7;
-        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 7;
-        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 7;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          llvm_unreachable("Unknown register class");
-        }
-        unsigned HWReg = TRI.getHWRegIndex(Reg);
-        int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
-          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
-        }
-      }
-
-      if (MI.isCall()) {
-        // Pseudo used just to encode the underlying global. Is there a better
-        // way to track this?
-
-        const MachineOperand *CalleeOp
-          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
-
-        const Function *Callee = getCalleeFunction(*CalleeOp);
-        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
-            CallGraphResourceInfo.end();
-        bool IsExternal = !Callee || Callee->isDeclaration();
-        if (!IsExternal)
-          I = CallGraphResourceInfo.find(Callee);
-
-        if (IsExternal || I == CallGraphResourceInfo.end()) {
-          // Avoid crashing on undefined behavior with an illegal call to a
-          // kernel. If a callsite's calling convention doesn't match the
-          // function's, it's undefined behavior. If the callsite calling
-          // convention does match, that would have errored earlier.
-          // FIXME: The verifier shouldn't allow this.
-          if (!IsExternal &&
-              AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
-            report_fatal_error("invalid call to entry function");
-
-          unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
-              TM.getMCSubtargetInfo(), false, ST.hasFlatAddressSpace());
-          // If this is a call to an external function, we put the
-          // max values computed in doInitialization().
-          // Subtract extra SGPRs in case of indirect calls.
-          // For indirect calls, we take the max for the module
-          // and use that as the register budget for functions
-          // which makes an indirect calls. This max value
-          // includes extra SGPRs too (e.g. flatscratch and vcc).
-          // which are getting added later.
-          // Subtract them here so that they don't get added twice.
-          MaxSGPR = NonKernelMaxSGPRs - ExtraSGPRs - 1;
-          MaxVGPR = NonKernelMaxVGPRs - 1;
-          // TODO: handle AGPRs
-          MaxAGPR = std::max(MaxAGPR, 23);
-
-          CalleeFrameSize = std::max(CalleeFrameSize,
-            static_cast<uint64_t>(AssumedStackSizeForExternalCall));
-
-          Info.UsesVCC = true;
-          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
-          Info.HasDynamicallySizedStack = true;
-        } else {
-          // We force CodeGen to run in SCC order, so the callee's register
-          // usage etc. should be the cumulative usage of all callees.
-
-          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
-          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
-          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
-          CalleeFrameSize
-            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
-          Info.UsesVCC |= I->second.UsesVCC;
-          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
-          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
-          Info.HasRecursion |= I->second.HasRecursion;
-        }
-
-        // FIXME: Call site could have norecurse on it
-        if (!Callee || !Callee->doesNotRecurse())
-          Info.HasRecursion = true;
-      }
-    }
-  }
-
-  Info.NumExplicitSGPR = MaxSGPR + 1;
-  Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
-  Info.PrivateSegmentSize += CalleeFrameSize;
-
-  return Info;
-}
-
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                         const MachineFunction &MF) {
-  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+      ResourceUsage->getResourceInfo(&MF.getFunction());
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
 
   ProgInfo.NumArchVGPR = Info.NumVGPR;
@@ -1522,3 +1081,9 @@
   }
   return true;
 }
+
+void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AMDGPUResourceUsageAnalysis>();
+  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
+  AsmPrinter::getAnalysisUsage(AU);
+}
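The new pass (the two files that follow) computes each function's usage once, bottom-up over the call graph. A self-contained toy model of that accumulation rule, under the same "the budget must cover the whole call tree" reasoning as the doc comment below (names and numbers are illustrative, not from the patch):

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  // Toy model: callees are processed first (SCC order), so a caller simply
  // folds each callee's already-final count in with std::max.
  struct Usage { int32_t NumVGPR; };

  static Usage analyze(Usage Own, const Usage *Callee) {
    if (Callee)
      Own.NumVGPR = std::max(Own.NumVGPR, Callee->NumVGPR);
    return Own;
  }

  int main() {
    Usage B = analyze({20}, nullptr); // leaf needing 20 VGPRs
    Usage A = analyze({10}, &B);      // caller itself needs only 10
    printf("VGPR usage reported for A: %d\n", A.NumVGPR); // prints 20
    return 0;
  }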
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -0,0 +1,71 @@
+//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
+
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/ValueMap.h"
+
+namespace llvm {
+
+class GCNSubtarget;
+class MachineFunction;
+class TargetMachine;
+
+struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass {
+  static char ID;
+
+public:
+  // Track resource usage for callee functions.
+  struct SIFunctionResourceInfo {
+    // Track the number of explicitly used VGPRs. Special registers reserved at
+    // the end are tracked separately.
+    int32_t NumVGPR = 0;
+    int32_t NumAGPR = 0;
+    int32_t NumExplicitSGPR = 0;
+    uint64_t PrivateSegmentSize = 0;
+    bool UsesVCC = false;
+    bool UsesFlatScratch = false;
+    bool HasDynamicallySizedStack = false;
+    bool HasRecursion = false;
+    bool HasIndirectCall = false;
+
+    int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+    int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+  };
+
+  AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {}
+
+  bool runOnSCC(CallGraphSCC &SCC) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfoWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+  const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
+    return CallGraphResourceInfo.find(F)->getSecond();
+  }
+
+private:
+  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF,
+                                              const TargetMachine &TM) const;
+  void propagateIndirectCallRegisterUsage();
+
+  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
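For reference, a hedged sketch of how a client is expected to read the result declared above (the printTotals wrapper is hypothetical; RUA, F and ST stand in for the analysis instance, a non-entry function, and its subtarget):

  #include "AMDGPUResourceUsageAnalysis.h"
  #include "GCNSubtarget.h"
  #include "llvm/Support/Debug.h"
  using namespace llvm;

  // NumExplicitSGPR excludes the special registers; getTotalNumSGPRs adds
  // VCC, flat_scratch and XNACK back in, and getTotalNumVGPRs folds AGPRs
  // in according to the subtarget's rules.
  static void printTotals(const AMDGPUResourceUsageAnalysis &RUA,
                          const Function *F, const GCNSubtarget &ST) {
    const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
        RUA.getResourceInfo(F);
    dbgs() << Info.getTotalNumSGPRs(ST) << " total SGPRs, "
           << Info.getTotalNumVGPRs(ST) << " total VGPRs\n";
  }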
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -0,0 +1,514 @@
+//===- AMDGPUResourceUsageAnalysis.cpp - analysis of resources ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+/// The results of this analysis are used to fill the register usage, flat
+/// usage, etc. into hardware registers.
+///
+/// The analysis takes callees into account. E.g. if a function A that needs 10
+/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
+/// will return 20.
+/// It is assumed that an indirect call can go into any function except
+/// hardware-entrypoints. Therefore the register usage of functions with
+/// indirect calls is estimated as the maximum of all non-entrypoint functions
+/// in the module.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+#define DEBUG_TYPE "amdgpu-resource-usage"
+
+char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
+char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
+
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+    "amdgpu-assume-external-call-stack-size",
+    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
+    cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+    "amdgpu-assume-dynamic-stack-object-size",
+    cl::desc("Assumed extra stack use if there are any "
+             "variable sized objects (in bytes)"),
+    cl::Hidden, cl::init(4096));
+
+INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
+                "Function register usage analysis", true, true)
+
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+  if (Op.isImm()) {
+    assert(Op.getImm() == 0);
+    return nullptr;
+  }
+
+  return cast<Function>(Op.getGlobal());
+}
+
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+                                  const SIInstrInfo &TII, unsigned Reg) {
+  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+      return true;
+  }
+
+  return false;
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
+    const GCNSubtarget &ST) const {
+  return NumExplicitSGPR +
+         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
+                                   ST.getTargetID().isXnackOnOrAny());
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+    const GCNSubtarget &ST) const {
+  if (ST.hasGFX90AInsts() && NumAGPR)
+    return alignTo(NumVGPR, 4) + NumAGPR;
+  return std::max(NumVGPR, NumAGPR);
+}
+
+bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  const TargetMachine &TM = TPC->getTM<TargetMachine>();
+  bool HasIndirectCall = false;
+
+  for (CallGraphNode *I : SCC) {
+    Function *F = I->getFunction();
+    if (!F || F->isDeclaration())
+      continue;
+
+    MachineModuleInfo &MMI =
+        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+
+    auto CI = CallGraphResourceInfo.insert(
+        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+    SIFunctionResourceInfo &Info = CI.first->second;
+    assert(CI.second && "should only be called once per function");
+    Info = analyzeResourceUsage(MF, TM);
+    HasIndirectCall |= Info.HasIndirectCall;
+  }
+
+  if (HasIndirectCall)
+    propagateIndirectCallRegisterUsage();
+
+  return false;
+}
+
+AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
+    const MachineFunction &MF, const TargetMachine &TM) const {
+  SIFunctionResourceInfo Info;
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
+                         MRI.isLiveIn(MFI->getPreloadedReg(
+                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+
+  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+  // instructions aren't used to access the scratch buffer. Inline assembly may
+  // need it though.
+  //
+  // If we only have implicit uses of flat_scr on flat instructions, it is not
+  // really needed.
+  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+    Info.UsesFlatScratch = false;
+  }
+
+  Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+  // Assume a big number if there are any unknown sized objects.
+  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+  if (Info.HasDynamicallySizedStack)
+    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+  if (MFI->isStackRealigned())
+    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
+
+  Info.UsesVCC =
+      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+  // If there are no calls, MachineRegisterInfo can tell us the used register
+  // count easily.
+  // A tail call isn't considered a call for MachineFrameInfo's purposes.
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+      if (MRI.isPhysRegUsed(Reg)) {
+        HighestVGPRReg = Reg;
+        break;
+      }
+    }
+
+    if (ST.hasMAIInsts()) {
+      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+        if (MRI.isPhysRegUsed(Reg)) {
+          HighestAGPRReg = Reg;
+          break;
+        }
+      }
+      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
+                         ? 0
+                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
+    }
+
+    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+      if (MRI.isPhysRegUsed(Reg)) {
+        HighestSGPRReg = Reg;
+        break;
+      }
+    }
+
+    // We found the maximum register index. They start at 0, so add one to get
+    // the number of registers.
+    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
+                       ? 0
+                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
+    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
+                               ? 0
+                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+    return Info;
+  }
+
+  int32_t MaxVGPR = -1;
+  int32_t MaxAGPR = -1;
+  int32_t MaxSGPR = -1;
+  uint64_t CalleeFrameSize = 0;
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      // TODO: Check regmasks? Do they occur anywhere except calls?
+      for (const MachineOperand &MO : MI.operands()) {
+        unsigned Width = 0;
+        bool IsSGPR = false;
+        bool IsAGPR = false;
+
+        if (!MO.isReg())
+          continue;
+
+        Register Reg = MO.getReg();
+        switch (Reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::EXEC_LO:
+        case AMDGPU::EXEC_HI:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+        case AMDGPU::M0_LO16:
+        case AMDGPU::M0_HI16:
+        case AMDGPU::SRC_SHARED_BASE:
+        case AMDGPU::SRC_SHARED_LIMIT:
+        case AMDGPU::SRC_PRIVATE_BASE:
+        case AMDGPU::SRC_PRIVATE_LIMIT:
+        case AMDGPU::SGPR_NULL:
+        case AMDGPU::MODE:
+          continue;
+
+        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+          llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
+        case AMDGPU::NoRegister:
+          assert(MI.isDebugInstr() &&
+                 "Instruction uses invalid noreg register");
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
+        case AMDGPU::VCC_LO_LO16:
+        case AMDGPU::VCC_LO_HI16:
+        case AMDGPU::VCC_HI_LO16:
+        case AMDGPU::VCC_HI_HI16:
+          Info.UsesVCC = true;
+          continue;
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
+          continue;
+
+        case AMDGPU::XNACK_MASK:
+        case AMDGPU::XNACK_MASK_LO:
+        case AMDGPU::XNACK_MASK_HI:
+          llvm_unreachable("xnack_mask registers should not be used");
+
+        case AMDGPU::LDS_DIRECT:
+          llvm_unreachable("lds_direct register should not be used");
+
+        case AMDGPU::TBA:
+        case AMDGPU::TBA_LO:
+        case AMDGPU::TBA_HI:
+        case AMDGPU::TMA:
+        case AMDGPU::TMA_LO:
+        case AMDGPU::TMA_HI:
+          llvm_unreachable("trap handler registers should not be used");
+
+        case AMDGPU::SRC_VCCZ:
+          llvm_unreachable("src_vccz register should not be used");
+
+        case AMDGPU::SRC_EXECZ:
+          llvm_unreachable("src_execz register should not be used");
+
+        case AMDGPU::SRC_SCC:
+          llvm_unreachable("src_scc register should not be used");
+
+        default:
+          break;
+        }
+
+        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 1;
+        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 1;
+        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 1;
+        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 2;
+        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 3;
+        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 3;
+        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 3;
+        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 4;
+        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 5;
+        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 5;
+        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 5;
+        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 6;
+        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 6;
+        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 6;
+        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 7;
+        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 7;
+        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 7;
+        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 8;
+        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 8;
+        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 16;
+        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 16;
+        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 32;
+        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 32;
+        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 32;
+        } else {
+          llvm_unreachable("Unknown register class");
+        }
+        unsigned HWReg = TRI.getHWRegIndex(Reg);
+        int MaxUsed = HWReg + Width - 1;
+        if (IsSGPR) {
+          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+        } else if (IsAGPR) {
+          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+        } else {
+          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+        }
+      }
+
+      if (MI.isCall()) {
+        // Pseudo used just to encode the underlying global. Is there a better
+        // way to track this?
+
+        const MachineOperand *CalleeOp =
+            TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+
+        const Function *Callee = getCalleeFunction(*CalleeOp);
+        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+            CallGraphResourceInfo.end();
+
+        // Avoid crashing on undefined behavior with an illegal call to a
+        // kernel. If a callsite's calling convention doesn't match the
+        // function's, it's undefined behavior. If the callsite calling
+        // convention does match, that would have errored earlier.
+        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+          report_fatal_error("invalid call to entry function");
+
+        bool IsIndirect = !Callee || Callee->isDeclaration();
+        if (!IsIndirect)
+          I = CallGraphResourceInfo.find(Callee);
+
+        if (IsIndirect || I == CallGraphResourceInfo.end()) {
+          CalleeFrameSize =
+              std::max(CalleeFrameSize,
+                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
+          // Register usage of indirect calls gets handled later
+          Info.UsesVCC = true;
+          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+          Info.HasDynamicallySizedStack = true;
+          Info.HasIndirectCall = true;
+        } else {
+          // We force CodeGen to run in SCC order, so the callee's register
+          // usage etc. should be the cumulative usage of all callees.
+          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
+          CalleeFrameSize =
+              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+          Info.UsesVCC |= I->second.UsesVCC;
+          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+          Info.HasRecursion |= I->second.HasRecursion;
+          Info.HasIndirectCall |= I->second.HasIndirectCall;
+        }
+
+        // FIXME: Call site could have norecurse on it
+        if (!Callee || !Callee->doesNotRecurse())
+          Info.HasRecursion = true;
+      }
+    }
+  }
+
+  Info.NumExplicitSGPR = MaxSGPR + 1;
+  Info.NumVGPR = MaxVGPR + 1;
+  Info.NumAGPR = MaxAGPR + 1;
+  Info.PrivateSegmentSize += CalleeFrameSize;
+
+  return Info;
+}
+
+void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
+  // Collect the maximum number of registers from non-hardware-entrypoints.
+  // All these functions are potential targets for indirect calls.
+  int32_t NonKernelMaxSGPRs = 0;
+  int32_t NonKernelMaxVGPRs = 0;
+  int32_t NonKernelMaxAGPRs = 0;
+
+  for (const auto &I : CallGraphResourceInfo) {
+    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
+      auto &Info = I.getSecond();
+      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
+      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
+      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
+    }
+  }
+
+  // Add register usage for functions with indirect calls.
+  // For calls to unknown functions, we assume the maximum register usage of
+  // all non-hardware-entrypoints in the current module.
+  for (auto &I : CallGraphResourceInfo) {
+    auto &Info = I.getSecond();
+    if (Info.HasIndirectCall) {
+      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
+      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
+      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+    }
+  }
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -390,6 +390,7 @@
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
+  initializeAMDGPUResourceUsageAnalysisPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeGCNPreRAOptimizationsPass(*PR);
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -92,6 +92,7 @@
   AMDGPUPerfHintAnalysis.cpp
   AMDILCFGStructurizer.cpp
   AMDGPUPrintfRuntimeBinding.cpp
+  AMDGPUResourceUsageAnalysis.cpp
   GCNHazardRecognizer.cpp
   GCNIterativeScheduler.cpp
   GCNMinRegStrategy.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -154,22 +154,22 @@
 declare void @undef_func()
 
 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 128
-; GFX90A: .amdhsa_next_free_vgpr 280
-; GFX90A: .amdhsa_accum_offset 256
+; GFX908: .amdhsa_next_free_vgpr 32
+; GFX90A: .amdhsa_next_free_vgpr 64
+; GFX90A: .amdhsa_accum_offset 32
 ; GCN908: NumVgprs: 128
 ; GCN90A: NumVgprs: 256
-; GCN: NumAgprs: 24
-; GFX908: TotalNumVgprs: 128
-; GFX90A: TotalNumVgprs: 280
-; GFX908: VGPRBlocks: 31
-; GFX90A: VGPRBlocks: 34
-; GFX908: NumVGPRsForWavesPerEU: 128
-; GFX90A: NumVGPRsForWavesPerEU: 280
-; GFX90A: AccumOffset: 256
-; GFX908: Occupancy: 2
-; GFX90A: Occupancy: 1
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
+; GCN: NumAgprs: 32
+; GFX908: TotalNumVgprs: 32
+; GFX90A: TotalNumVgprs: 64
+; GFX908: VGPRBlocks: 7
+; GFX90A: VGPRBlocks: 7
+; GFX908: NumVGPRsForWavesPerEU: 32
+; GFX90A: NumVGPRsForWavesPerEU: 64
+; GFX90A: AccumOffset: 32
+; GFX908: Occupancy: 8
+; GFX90A: Occupancy: 8
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -147,7 +147,8 @@
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT: - .registers:
-; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
+; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
+; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
 ; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT: .shader_functions:
 ; GCN-NEXT: dynamic_stack:
@@ -175,27 +176,29 @@
 ; GCN-NEXT: .vgpr_count: 0x1{{$}}
 ; GCN-NEXT: no_stack_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GCN-NEXT: .sgpr_count: 0x20{{$}}
+; GCN-NEXT: .sgpr_count: 0x21{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
-; GCN-NEXT: .vgpr_count: 0x1{{$}}
+; GCN-NEXT: .vgpr_count: 0x2{{$}}
 ; GCN-NEXT: no_stack_extern_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GFX8-NEXT: .sgpr_count: 0x24{{$}}
+; GFX9-NEXT: .sgpr_count: 0x28{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; GCN-NEXT: .vgpr_count: 0x29{{$}}
 ; GCN-NEXT: no_stack_extern_call_many_args:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GFX8-NEXT: .sgpr_count: 0x24{{$}}
+; GFX9-NEXT: .sgpr_count: 0x28{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
+; GISEL-NEXT: .vgpr_count: 0x34{{$}}
 ; GCN-NEXT: no_stack_indirect_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GFX8-NEXT: .sgpr_count: 0x24{{$}}
+; GFX9-NEXT: .sgpr_count: 0x28{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
+; GISEL-NEXT: .vgpr_count: 0x34{{$}}
 ; GCN-NEXT: simple_lds:
 ; GCN-NEXT: .lds_size: 0x100{{$}}
 ; GCN-NEXT: .sgpr_count: 0x20{{$}}
@@ -203,10 +206,9 @@
 ; GCN-NEXT: .vgpr_count: 0x1{{$}}
 ; GCN-NEXT: simple_lds_recurse:
 ; GCN-NEXT: .lds_size: 0x100{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GCN-NEXT: .sgpr_count: 0x24{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; GCN-NEXT: .vgpr_count: 0x29{{$}}
 ; GCN-NEXT: simple_stack:
 ; GCN-NEXT: .lds_size: 0{{$}}
 ; GCN-NEXT: .sgpr_count: 0x21{{$}}
@@ -219,20 +221,20 @@
 ; GCN-NEXT: .vgpr_count: 0x3{{$}}
 ; GCN-NEXT: simple_stack_extern_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GFX8-NEXT: .sgpr_count: 0x24{{$}}
+; GFX9-NEXT: .sgpr_count: 0x28{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
 ; GCN-NEXT: simple_stack_indirect_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GFX8-NEXT: .sgpr_count: 0x24{{$}}
+; GFX9-NEXT: .sgpr_count: 0x28{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; SDAG-NEXT: .vgpr_count: 0x2b{{$}}
+; GISEL-NEXT: .vgpr_count: 0x34{{$}}
 ; GCN-NEXT: simple_stack_recurse:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x68{{$}}
-; GFX9-NEXT: .sgpr_count: 0x66{{$}}
+; GCN-NEXT: .sgpr_count: 0x24{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x40{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
 ; GCN-NEXT: ...
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -227,10 +227,10 @@
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 102
-; VI-NOBUG: NumSgprs: 102
+; CI: NumSgprs: 84
+; VI-NOBUG: NumSgprs: 86
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 64
+; GCN: NumVgprs: 50
 define amdgpu_kernel void @count_use_sgpr96_external_call() {
 entry:
   tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@@ -241,10 +241,10 @@
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 102
-; VI-NOBUG: NumSgprs: 102
+; CI: NumSgprs: 84
+; VI-NOBUG: NumSgprs: 86
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 64
+; GCN: NumVgprs: 50
 define amdgpu_kernel void @count_use_sgpr160_external_call() {
 entry:
   tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -255,10 +255,10 @@
 ; Make sure there's no assert when a vgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_vgpr160_external_call
 ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 102
-; VI-NOBUG: NumSgprs: 102
+; CI: NumSgprs: 84
+; VI-NOBUG: NumSgprs: 86
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 64
+; GCN: NumVgprs: 50
 define amdgpu_kernel void @count_use_vgpr160_external_call() {
 entry:
   tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
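The lower NumSgprs/NumVgprs expected above follow from the new external-call model: instead of charging the subtarget maximum, an unknown callee is charged the maximum usage of the non-entry functions actually present in the module. A standalone sketch of that propagation rule (simplified, with a hypothetical FuncInfo type; the real implementation is propagateIndirectCallRegisterUsage() above):

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  struct FuncInfo {
    int32_t NumSGPR = 0, NumVGPR = 0;
    bool IsEntry = false, HasIndirectCall = false;
  };

  // Entry points cannot be indirect-call targets, so they neither feed nor
  // receive the module-wide maxima.
  static void propagate(std::vector<FuncInfo> &Module) {
    int32_t MaxS = 0, MaxV = 0;
    for (const FuncInfo &F : Module)
      if (!F.IsEntry) {
        MaxS = std::max(MaxS, F.NumSGPR);
        MaxV = std::max(MaxV, F.NumVGPR);
      }
    for (FuncInfo &F : Module)
      if (F.HasIndirectCall) {
        F.NumSGPR = std::max(F.NumSGPR, MaxS);
        F.NumVGPR = std::max(F.NumVGPR, MaxV);
      }
  }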
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -15,8 +15,8 @@
 ; GCN-NEXT: amd_machine_version_stepping = 0
 ; GCN-NEXT: kernel_code_entry_byte_offset = 256
 ; GCN-NEXT: kernel_code_prefetch_byte_size = 0
-; GCN-NEXT: granulated_workitem_vgpr_count = 15
-; GCN-NEXT: granulated_wavefront_sgpr_count = 12
+; GCN-NEXT: granulated_workitem_vgpr_count = 7
+; GCN-NEXT: granulated_wavefront_sgpr_count = 4
 ; GCN-NEXT: priority = 0
 ; GCN-NEXT: float_mode = 240
 ; GCN-NEXT: priv = 0
@@ -59,8 +59,8 @@
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 0
 ; GCN-NEXT: workgroup_fbarrier_count = 0
-; GCN-NEXT: wavefront_sgpr_count = 102
-; GCN-NEXT: workitem_vgpr_count = 64
+; GCN-NEXT: wavefront_sgpr_count = 37
+; GCN-NEXT: workitem_vgpr_count = 32
 ; GCN-NEXT: reserved_vgpr_first = 0
 ; GCN-NEXT: reserved_vgpr_count = 0
 ; GCN-NEXT: reserved_sgpr_first = 0
@@ -111,8 +111,8 @@
 ; GCN-NEXT: amd_machine_version_stepping = 0
 ; GCN-NEXT: kernel_code_entry_byte_offset = 256
 ; GCN-NEXT: kernel_code_prefetch_byte_size = 0
-; GCN-NEXT: granulated_workitem_vgpr_count = 15
-; GCN-NEXT: granulated_wavefront_sgpr_count = 12
+; GCN-NEXT: granulated_workitem_vgpr_count = 7
+; GCN-NEXT: granulated_wavefront_sgpr_count = 4
 ; GCN-NEXT: priority = 0
 ; GCN-NEXT: float_mode = 240
 ; GCN-NEXT: priv = 0
@@ -155,8 +155,8 @@
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 0
 ; GCN-NEXT: workgroup_fbarrier_count = 0
-; GCN-NEXT: wavefront_sgpr_count = 102
-; GCN-NEXT: workitem_vgpr_count = 64
+; GCN-NEXT: wavefront_sgpr_count = 37
+; GCN-NEXT: workitem_vgpr_count = 32
 ; GCN-NEXT: reserved_vgpr_first = 0
 ; GCN-NEXT: reserved_vgpr_count = 0
 ; GCN-NEXT: reserved_sgpr_first = 0
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -136,6 +136,8 @@
 ; GCN-O0-NEXT: Branch relaxation pass
 ; GCN-O0-NEXT: Register Usage Information Collector Pass
 ; GCN-O0-NEXT: Live DEBUG_VALUE analysis
+; GCN-O0-NEXT: Function register usage analysis
+; GCN-O0-NEXT: FunctionPass Manager
 ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O0-NEXT: Machine Optimization Remark Emitter
 ; GCN-O0-NEXT: AMDGPU Assembly Printer
@@ -384,6 +386,8 @@
 ; GCN-O1-NEXT: Branch relaxation pass
 ; GCN-O1-NEXT: Register Usage Information Collector Pass
 ; GCN-O1-NEXT: Live DEBUG_VALUE analysis
+; GCN-O1-NEXT: Function register usage analysis
+; GCN-O1-NEXT: FunctionPass Manager
 ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O1-NEXT: Machine Optimization Remark Emitter
 ; GCN-O1-NEXT: AMDGPU Assembly Printer
@@ -665,6 +669,8 @@
 ; GCN-O1-OPTS-NEXT: Branch relaxation pass
 ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
 ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis
+; GCN-O1-OPTS-NEXT: Function register usage analysis
+; GCN-O1-OPTS-NEXT: FunctionPass Manager
 ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
 ; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
@@ -949,6 +955,8 @@
 ; GCN-O2-NEXT: Branch relaxation pass
 ; GCN-O2-NEXT: Register Usage Information Collector Pass
 ; GCN-O2-NEXT: Live DEBUG_VALUE analysis
+; GCN-O2-NEXT: Function register usage analysis
+; GCN-O2-NEXT: FunctionPass Manager
 ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O2-NEXT: Machine Optimization Remark Emitter
 ; GCN-O2-NEXT: AMDGPU Assembly Printer
@@ -1246,6 +1254,8 @@
 ; GCN-O3-NEXT: Branch relaxation pass
 ; GCN-O3-NEXT: Register Usage Information Collector Pass
 ; GCN-O3-NEXT: Live DEBUG_VALUE analysis
+; GCN-O3-NEXT: Function register usage analysis
+; GCN-O3-NEXT: FunctionPass Manager
 ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O3-NEXT: Machine Optimization Remark Emitter
 ; GCN-O3-NEXT: AMDGPU Assembly Printer
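A closing note on reading the register-count checks in the tests above: the encoded fields are granulated, not raw counts. Under the usual granule assumptions (4 VGPRs per block on gfx908, 8 on gfx90a, and ACCUM_OFFSET encoded in units of 4 VGPRs minus one), the updated kernel_call_undef_func values all encode to 7; a small standalone sketch of the arithmetic:

  #include <cstdint>
  #include <cstdio>

  // VGPRBlocks = ceil(NumVGPRs / Granule) - 1 (granule values assumed above).
  static uint32_t vgprBlocks(uint32_t NumVGPRs, uint32_t Granule) {
    return (NumVGPRs + Granule - 1) / Granule - 1;
  }

  int main() {
    printf("gfx908 VGPRBlocks:   %u\n", vgprBlocks(32, 4)); // 32 VGPRs -> 7
    printf("gfx90a VGPRBlocks:   %u\n", vgprBlocks(64, 8)); // 64 VGPRs -> 7
    printf("gfx90a ACCUM_OFFSET: %u\n", 32u / 4 - 1);       // offset 32 -> 7
    return 0;
  }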