diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -15,8 +15,9 @@
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
 
 #include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
 #include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDKernelCodeT.h"
+#include "SIFunctionResourceInfoTracker.h"
 #include "SIProgramInfo.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -38,32 +39,14 @@
 class AMDGPUAsmPrinter final : public AsmPrinter {
 private:
-  // Track resource usage for callee functions.
-  struct SIFunctionResourceInfo {
-    // Track the number of explicitly used VGPRs. Special registers reserved at
-    // the end are tracked separately.
-    int32_t NumVGPR = 0;
-    int32_t NumAGPR = 0;
-    int32_t NumExplicitSGPR = 0;
-    uint64_t PrivateSegmentSize = 0;
-    bool UsesVCC = false;
-    bool UsesFlatScratch = false;
-    bool HasDynamicallySizedStack = false;
-    bool HasRecursion = false;
-
-    int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
-    int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
-  };
-
   SIProgramInfo CurrentProgramInfo;
-  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
 
   std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
 
   MCCodeEmitter *DumpCodeInstEmitter = nullptr;
 
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
+  std::unique_ptr<SIFunctionResourceInfo> SIFuncResourceInfo;
 
   void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
   void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
@@ -94,6 +77,7 @@
                                     const SIProgramInfo &PI) const;
 
 public:
+  std::unique_ptr<SIFunctionResourceInfoTracker> ResourceTracker;
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -49,22 +49,6 @@
 using namespace llvm::AMDGPU;
 using namespace llvm::AMDGPU::HSAMD;
 
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
-static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
-    "amdgpu-assume-external-call-stack-size",
-    cl::desc("Assumed stack use of any external call (in bytes)"),
-    cl::Hidden,
-    cl::init(16384));
-
-static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
-    "amdgpu-assume-dynamic-stack-object-size",
-    cl::desc("Assumed extra stack use if there are any "
-             "variable sized objects (in bytes)"),
-    cl::Hidden,
-    cl::init(4096));
-
 // This should get the default rounding mode from the kernel. We just set the
 // default here, but this could change if the OpenCL rounding mode pragmas are
 // used.
@@ -108,10 +92,11 @@
 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                    std::unique_ptr<MCStreamer> Streamer)
     : AsmPrinter(TM, std::move(Streamer)) {
-  if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
-    HSAMetadataStream.reset(new MetadataStreamerV3());
-  else
-    HSAMetadataStream.reset(new MetadataStreamerV2());
+  ResourceTracker = std::make_unique<SIFunctionResourceInfoTracker>();
+  if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
+    HSAMetadataStream.reset(new MetadataStreamerV3());
+  else
+    HSAMetadataStream.reset(new MetadataStreamerV2());
 }
 
 StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -330,7 +315,7 @@
 }
 
 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
-  CallGraphResourceInfo.clear();
+  ResourceTracker->CallGraphResourceMap.clear();
 
   // Pad with s_code_end to help tools and guard against instruction prefetch
   // causing stale data in caches. Arguably this should be done by the linker,
@@ -447,11 +432,20 @@
   if (MFI->isEntryFunction()) {
     getSIProgramInfo(CurrentProgramInfo, MF);
   } else {
-    auto I = CallGraphResourceInfo.insert(
-        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
-    SIFunctionResourceInfo &Info = I.first->second;
+    // TODO: We insert an empty resource info entry for the function, take a
+    // reference to it, and then overwrite it with what analyzeResourceUsage
+    // returns. This is hacky. Ideally, the tracker should update the entry
+    // internally, based on a flag. Recursion also needs careful handling,
+    // because we must not add a callee's resource usage to its caller when
+    // the caller *is* the callee.
+    auto I = ResourceTracker->CallGraphResourceMap.insert(
+        std::make_pair(&MF.getFunction(),
+                       std::make_unique<SIFunctionResourceInfo>()));
+    auto &Info = I.first->second;
     assert(I.second && "should only be called once per function");
-    Info = analyzeResourceUsage(MF);
+    Info = ResourceTracker->analyzeResourceUsage(MF);
   }
 
   if (STM.isAmdPalOS())
@@ -485,14 +479,13 @@
 
   if (!MFI->isEntryFunction()) {
     OutStreamer->emitRawComment(" Function info:", false);
-    SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+    auto &Info = ResourceTracker->CallGraphResourceMap[&MF.getFunction()];
     emitCommonFunctionComments(
-        Info.NumVGPR,
-        STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
-        Info.getTotalNumVGPRs(STM),
-        Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
-        Info.PrivateSegmentSize,
-        getFunctionCodeSize(MF), MFI);
+        Info->getNumVGPR(),
+        STM.hasMAIInsts() ? Info->getNumAGPR() : Optional<uint32_t>(),
+        Info->getTotalNumVGPRs(STM),
+        Info->getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
+        Info->getPrivateSegmentSize(), getFunctionCodeSize(MF), MFI);
     return false;
   }
 
@@ -595,401 +588,20 @@
   return CodeSize;
 }
 
-static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
-                                  const SIInstrInfo &TII,
-                                  unsigned Reg) {
-  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
-    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
-      return true;
-  }
-
-  return false;
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
-    const GCNSubtarget &ST) const {
-  return NumExplicitSGPR +
-         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch);
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
-    const GCNSubtarget &ST) const {
-  return std::max(NumVGPR, NumAGPR);
-}
-
-static const Function *getCalleeFunction(const MachineOperand &Op) {
-  if (Op.isImm()) {
-    assert(Op.getImm() == 0);
-    return nullptr;
-  }
-
-  return cast<Function>(Op.getGlobal());
-}
-
-AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
-    const MachineFunction &MF) const {
-  SIFunctionResourceInfo Info;
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-
-  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
-                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
-
-  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
-  // instructions aren't used to access the scratch buffer. Inline assembly may
-  // need it though.
-  //
-  // If we only have implicit uses of flat_scr on flat instructions, it is not
-  // really needed.
-  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
-      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
-       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
-       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
-    Info.UsesFlatScratch = false;
-  }
-
-  Info.PrivateSegmentSize = FrameInfo.getStackSize();
-
-  // Assume a big number if there are any unknown sized objects.
-  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
-  if (Info.HasDynamicallySizedStack)
-    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
-
-  if (MFI->isStackRealigned())
-    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
-
-  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
-                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
-  // If there are no calls, MachineRegisterInfo can tell us the used register
-  // count easily.
-  // A tail call isn't considered a call for MachineFrameInfo's purposes.
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
-    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
-      if (MRI.isPhysRegUsed(Reg)) {
-        HighestVGPRReg = Reg;
-        break;
-      }
-    }
-
-    if (ST.hasMAIInsts()) {
-      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
-      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
-        if (MRI.isPhysRegUsed(Reg)) {
-          HighestAGPRReg = Reg;
-          break;
-        }
-      }
-      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
-        TRI.getHWRegIndex(HighestAGPRReg) + 1;
-    }
-    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
-    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
-      if (MRI.isPhysRegUsed(Reg)) {
-        HighestSGPRReg = Reg;
-        break;
-      }
-    }
-
-    // We found the maximum register index. They start at 0, so add one to get
-    // the number of registers.
-    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
-      TRI.getHWRegIndex(HighestVGPRReg) + 1;
-    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
-      TRI.getHWRegIndex(HighestSGPRReg) + 1;
-
-    return Info;
-  }
-
-  int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
-  uint64_t CalleeFrameSize = 0;
-
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
-
-        if (!MO.isReg())
-          continue;
-
-        Register Reg = MO.getReg();
-        switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::MODE:
-          continue;
-
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-          llvm_unreachable("src_pops_exiting_wave_id should not be used");
-
-        case AMDGPU::NoRegister:
-          assert(MI.isDebugInstr());
-          continue;
-
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
-        case AMDGPU::XNACK_MASK:
-        case AMDGPU::XNACK_MASK_LO:
-        case AMDGPU::XNACK_MASK_HI:
-          llvm_unreachable("xnack_mask registers should not be used");
-
-        case AMDGPU::LDS_DIRECT:
-          llvm_unreachable("lds_direct register should not be used");
-
-        case AMDGPU::TBA:
-        case AMDGPU::TBA_LO:
-        case AMDGPU::TBA_HI:
-        case AMDGPU::TMA:
-        case AMDGPU::TMA_LO:
-        case AMDGPU::TMA_HI:
-          llvm_unreachable("trap handler registers should not be used");
-
-        case AMDGPU::SRC_VCCZ:
-          llvm_unreachable("src_vccz register should not be used");
-
-        case AMDGPU::SRC_EXECZ:
-          llvm_unreachable("src_execz register should not be used");
-
-        case AMDGPU::SRC_SCC:
-          llvm_unreachable("src_scc register should not be used");
-
-        default:
-          break;
-        }
-
-        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
-            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
-                 "trap handler registers should not be used");
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          llvm_unreachable("Unknown register class");
-        }
-        unsigned HWReg = TRI.getHWRegIndex(Reg);
-        int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
-          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
-        }
-      }
-
-      if (MI.isCall()) {
-        // Pseudo used just to encode the underlying global. Is there a better
-        // way to track this?
-
-        const MachineOperand *CalleeOp
-          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
-
-        const Function *Callee = getCalleeFunction(*CalleeOp);
-        if (!Callee || Callee->isDeclaration()) {
-          // If this is a call to an external function, we can't do much. Make
-          // conservative guesses.
-
-          // 48 SGPRs - vcc, - flat_scr, -xnack
-          int MaxSGPRGuess =
-            47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
-          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
-          MaxVGPR = std::max(MaxVGPR, 23);
-          MaxAGPR = std::max(MaxAGPR, 23);
-
-          CalleeFrameSize = std::max(CalleeFrameSize,
-            static_cast<uint64_t>(AssumedStackSizeForExternalCall));
-
-          Info.UsesVCC = true;
-          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
-          Info.HasDynamicallySizedStack = true;
-        } else {
-          // We force CodeGen to run in SCC order, so the callee's register
-          // usage etc. should be the cumulative usage of all callees.
-
-          auto I = CallGraphResourceInfo.find(Callee);
-          if (I == CallGraphResourceInfo.end()) {
-            // Avoid crashing on undefined behavior with an illegal call to a
-            // kernel. If a callsite's calling convention doesn't match the
-            // function's, it's undefined behavior. If the callsite calling
-            // convention does match, that would have errored earlier.
-            // FIXME: The verifier shouldn't allow this.
-            if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
-              report_fatal_error("invalid call to entry function");
-
-            llvm_unreachable("callee should have been handled before caller");
-          }
-
-          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
-          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
-          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
-          CalleeFrameSize
-            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
-          Info.UsesVCC |= I->second.UsesVCC;
-          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
-          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
-          Info.HasRecursion |= I->second.HasRecursion;
-        }
-
-        // FIXME: Call site could have norecurse on it
-        if (!Callee || !Callee->doesNotRecurse())
-          Info.HasRecursion = true;
-      }
-    }
-  }
-
-  Info.NumExplicitSGPR = MaxSGPR + 1;
-  Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
-  Info.PrivateSegmentSize += CalleeFrameSize;
-
-  return Info;
-}
-
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                         const MachineFunction &MF) {
-  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+  auto Info = ResourceTracker->analyzeResourceUsage(MF);
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
 
-  ProgInfo.NumArchVGPR = Info.NumVGPR;
-  ProgInfo.NumAccVGPR = Info.NumAGPR;
-  ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
-  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
-  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
-  ProgInfo.VCCUsed = Info.UsesVCC;
-  ProgInfo.FlatUsed = Info.UsesFlatScratch;
-  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+  ProgInfo.NumArchVGPR = Info->getNumVGPR();
+  ProgInfo.NumAccVGPR = Info->getNumAGPR();
+  ProgInfo.NumVGPR = Info->getTotalNumVGPRs(STM);
+  ProgInfo.NumSGPR = Info->getNumExplicitSGPR();
+  ProgInfo.ScratchSize = Info->getPrivateSegmentSize();
+  ProgInfo.VCCUsed = Info->usesVCC();
+  ProgInfo.FlatUsed = Info->usesFlatScratch();
+  ProgInfo.DynamicCallStack =
+      Info->hasDynamicallySizedStack() || Info->hasRecursion();
 
   if (!isUInt<32>(ProgInfo.ScratchSize)) {
     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -115,6 +115,7 @@
   SIInsertWaitcnts.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
+  SIFunctionResourceInfoTracker.cpp
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
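
A note on the insert-then-overwrite pattern used in runOnMachineFunction above: it leans on DenseMap::insert returning an (iterator, inserted) pair. A minimal standalone sketch of that pattern follows; the Record struct and plain int key are stand-ins for the real SIFunctionResourceInfo and Function pointer, not part of the patch:

#include "llvm/ADT/DenseMap.h"
#include <cassert>
#include <memory>
#include <utility>

namespace {
struct Record { int NumVGPR = 0; }; // stand-in for SIFunctionResourceInfo

llvm::DenseMap<int, std::unique_ptr<Record>> Map; // stand-in for the tracker map

void analyzeOnce(int Key) {
  // insert() creates the entry only if the key is new; the bool in the
  // returned pair lets us assert each function is analyzed exactly once.
  auto I = Map.insert(std::make_pair(Key, std::make_unique<Record>()));
  assert(I.second && "should only be called once per function");
  auto &Info = I.first->second; // reference to the owning unique_ptr
  Info->NumVGPR = 42;           // the real code overwrites the whole entry
}
} // namespace
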
diff --git a/llvm/lib/Target/AMDGPU/SIFunctionResourceInfoTracker.h b/llvm/lib/Target/AMDGPU/SIFunctionResourceInfoTracker.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIFunctionResourceInfoTracker.h
@@ -0,0 +1,119 @@
+//===- SIFunctionResourceInfoTracker.h - Track resource usage --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDGPU function resource info tracker.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIFUNCTIONRESOURCEINFOTRACKER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIFUNCTIONRESOURCEINFOTRACKER_H
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include <cstdint>
+#include <memory>
+
+namespace llvm {
+
+class AMDGPUMachineFunction;
+class GCNSubtarget;
+
+/// Track resource usage for callee functions. The number of explicitly used
+/// VGPRs is tracked here; special registers reserved at the end are tracked
+/// separately.
+class SIFunctionResourceInfo {
+private:
+  int32_t NumVGPR = 0;
+  int32_t NumAGPR = 0;
+  int32_t NumExplicitSGPR = 0;
+  uint64_t PrivateSegmentSize = 0;
+  bool UsesVCC = false;
+  bool UsesFlatScratch = false;
+  bool HasDynamicallySizedStack = false;
+  bool HasRecursion = false;
+  StringRef DebugName;
+
+public:
+  void setName(StringRef Name) { DebugName = Name; }
+  StringRef getName() const { return DebugName; }
+
+  void setNumVGPR(int32_t Num) { NumVGPR = Num; }
+  int32_t getNumVGPR() const { return NumVGPR; }
+
+  void setNumAGPR(int32_t Num) { NumAGPR = Num; }
+  int32_t getNumAGPR() const { return NumAGPR; }
+
+  void setNumExplicitSGPR(int32_t SGPR) { NumExplicitSGPR = SGPR; }
+  int32_t getNumExplicitSGPR() const { return NumExplicitSGPR; }
+
+  void setPrivateSegmentSize(uint64_t SegmentSize) {
+    PrivateSegmentSize = SegmentSize;
+  }
+  uint64_t getPrivateSegmentSize() const { return PrivateSegmentSize; }
+
+  void setVCCUsage(bool V) { UsesVCC = V; }
+  bool usesVCC() const { return UsesVCC; }
+
+  void setFlatScratchUsage(bool F) { UsesFlatScratch = F; }
+  bool usesFlatScratch() const { return UsesFlatScratch; }
+
+  void setDynamicallySizedStackUsage(bool Use) {
+    HasDynamicallySizedStack = Use;
+  }
+  bool hasDynamicallySizedStack() const { return HasDynamicallySizedStack; }
+
+  void setRecursionUsage(bool R) { HasRecursion = R; }
+  bool hasRecursion() const { return HasRecursion; }
+
+  int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+  int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+};
+
+class SIFunctionResourceInfoTracker {
+public:
+  DenseMap<const Function *, std::unique_ptr<SIFunctionResourceInfo>>
+      CallGraphResourceMap;
+
+  const Function *getCalleeFunction(const MachineOperand &Op);
+  void gatherResourceInfoFromCallGraph(
+      std::unique_ptr<SIFunctionResourceInfo> &Info,
+      const MachineFunction &MF);
+
+  int32_t getLeafFunctionVGPRsUsed(const MachineFunction &MF);
+  int32_t getLeafFunctionAGPRsUsed(const MachineFunction &MF);
+  int32_t getLeafFunctionSGPRsUsed(const MachineFunction &MF);
+
+  std::unique_ptr<SIFunctionResourceInfo>
+  analyzeResourceUsage(const MachineFunction &MF);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIFUNCTIONRESOURCEINFOTRACKER_H
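
To make the header's counting convention concrete: register counts are "highest hardware register index used, plus one", and getTotalNumSGPRs folds the reserved extra registers on top of the explicit count. A small hypothetical use of the accessors; the register numbers are illustrative only, and the exact extra-SGPR count depends on the subtarget:

#include "SIFunctionResourceInfoTracker.h"
using namespace llvm;

// Hand-populates a record the way the tracker would for a leaf function
// whose highest used registers are v23 and s29 and which uses VCC.
int32_t exampleTotalSGPRs(const GCNSubtarget &ST) {
  SIFunctionResourceInfo Info;
  Info.setNumVGPR(24);         // v23 used => index 23 + 1
  Info.setNumExplicitSGPR(30); // s29 used => index 29 + 1
  Info.setVCCUsage(true);      // VCC is folded in as a reserved "extra" SGPR
  // getTotalNumSGPRs == 30 + IsaInfo::getNumExtraSGPRs(&ST, /*VCC=*/true,
  //                                                    /*FlatScratch=*/false)
  return Info.getTotalNumSGPRs(ST);
}
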
diff --git a/llvm/lib/Target/AMDGPU/SIFunctionResourceInfoTracker.cpp b/llvm/lib/Target/AMDGPU/SIFunctionResourceInfoTracker.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIFunctionResourceInfoTracker.cpp
@@ -0,0 +1,470 @@
+//===- SIFunctionResourceInfoTracker.cpp - Track resource usage ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implementation of the function resource tracker.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIFunctionResourceInfoTracker.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+    "amdgpu-assume-external-call-stack-size",
+    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
+    cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+    "amdgpu-assume-dynamic-stack-object-size",
+    cl::desc("Assumed extra stack use if there are any "
+             "variable sized objects (in bytes)"),
+    cl::Hidden, cl::init(4096));
+
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+                                  const SIInstrInfo &TII, unsigned Reg) {
+  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+      return true;
+  }
+
+  return false;
+}
+
+int32_t SIFunctionResourceInfo::getTotalNumSGPRs(const GCNSubtarget &ST) const {
+  return NumExplicitSGPR +
+         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch);
+}
+
+int32_t SIFunctionResourceInfo::getTotalNumVGPRs(const GCNSubtarget &ST) const {
+  return std::max(NumVGPR, NumAGPR);
+}
+
+int32_t SIFunctionResourceInfoTracker::getLeafFunctionVGPRsUsed(
+    const MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+
+  assert(!MF.getFrameInfo().hasCalls() && !MF.getFrameInfo().hasTailCall() &&
+         "Should be called for leaf functions only.");
+
+  MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      HighestVGPRReg = Reg;
+      break;
+    }
+  }
+  return HighestVGPRReg == AMDGPU::NoRegister
+             ? 0
+             : TRI.getHWRegIndex(HighestVGPRReg) + 1;
+}
+
+int32_t SIFunctionResourceInfoTracker::getLeafFunctionAGPRsUsed(
+    const MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+
+  assert(!MF.getFrameInfo().hasCalls() && !MF.getFrameInfo().hasTailCall() &&
+         "Should be called for leaf functions only.");
+
+  MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+  for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      HighestAGPRReg = Reg;
+      break;
+    }
+  }
+  return HighestAGPRReg == AMDGPU::NoRegister
+             ? 0
+             : TRI.getHWRegIndex(HighestAGPRReg) + 1;
+}
+
+int32_t SIFunctionResourceInfoTracker::getLeafFunctionSGPRsUsed(
+    const MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+
+  assert(!MF.getFrameInfo().hasCalls() && !MF.getFrameInfo().hasTailCall() &&
+         "Should be called for leaf functions only.");
+
+  MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      HighestSGPRReg = Reg;
+      break;
+    }
+  }
+  return HighestSGPRReg == AMDGPU::NoRegister
+             ? 0
+             : TRI.getHWRegIndex(HighestSGPRReg) + 1;
+}
+
+const Function *
+SIFunctionResourceInfoTracker::getCalleeFunction(const MachineOperand &Op) {
+  if (Op.isImm()) {
+    assert(Op.getImm() == 0);
+    return nullptr;
+  }
+  return cast<Function>(Op.getGlobal());
+}
+
+void SIFunctionResourceInfoTracker::gatherResourceInfoFromCallGraph(
+    std::unique_ptr<SIFunctionResourceInfo> &Info, const MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  int32_t MaxVGPR = -1;
+  int32_t MaxAGPR = -1;
+  int32_t MaxSGPR = -1;
+  uint64_t CalleeFrameSize = 0;
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      // TODO: Check regmasks? Do they occur anywhere except calls?
+      for (const MachineOperand &MO : MI.operands()) {
+        unsigned Width = 0;
+        bool IsSGPR = false;
+        bool IsAGPR = false;
+
+        if (!MO.isReg())
+          continue;
+
+        Register Reg = MO.getReg();
+        switch (Reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::EXEC_LO:
+        case AMDGPU::EXEC_HI:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+        case AMDGPU::SRC_SHARED_BASE:
+        case AMDGPU::SRC_SHARED_LIMIT:
+        case AMDGPU::SRC_PRIVATE_BASE:
+        case AMDGPU::SRC_PRIVATE_LIMIT:
+        case AMDGPU::SGPR_NULL:
+        case AMDGPU::MODE:
+          continue;
+
+        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+          llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
+        case AMDGPU::NoRegister:
+          assert(MI.isDebugInstr());
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
+        case AMDGPU::VCC_LO_LO16:
+        case AMDGPU::VCC_LO_HI16:
+        case AMDGPU::VCC_HI_LO16:
+        case AMDGPU::VCC_HI_HI16:
+          Info->setVCCUsage(true);
+          continue;
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
+          continue;
+
+        case AMDGPU::XNACK_MASK:
+        case AMDGPU::XNACK_MASK_LO:
+        case AMDGPU::XNACK_MASK_HI:
+          llvm_unreachable("xnack_mask registers should not be used");
+
+        case AMDGPU::LDS_DIRECT:
+          llvm_unreachable("lds_direct register should not be used");
+
+        case AMDGPU::TBA:
+        case AMDGPU::TBA_LO:
+        case AMDGPU::TBA_HI:
+        case AMDGPU::TMA:
+        case AMDGPU::TMA_LO:
+        case AMDGPU::TMA_HI:
+          llvm_unreachable("trap handler registers should not be used");
+
+        case AMDGPU::SRC_VCCZ:
+          llvm_unreachable("src_vccz register should not be used");
+
+        case AMDGPU::SRC_EXECZ:
+          llvm_unreachable("src_execz register should not be used");
+
+        case AMDGPU::SRC_SCC:
+          llvm_unreachable("src_scc register should not be used");
+
+        default:
+          break;
+        }
+
+        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 1;
+        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 1;
+        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 1;
+        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 2;
+        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 3;
+        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 3;
+        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 3;
+        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 4;
+        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 5;
+        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 5;
+        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 5;
+        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 6;
+        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 6;
+        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 6;
+        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 8;
+        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 8;
+        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+                 "trap handler registers should not be used");
+          IsSGPR = true;
+          Width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 16;
+        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 16;
+        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 32;
+        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 32;
+        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 32;
+        } else {
+          llvm_unreachable("Unknown register class");
+        }
+        unsigned HWReg = TRI.getHWRegIndex(Reg);
+        int MaxUsed = HWReg + Width - 1;
+        if (IsSGPR) {
+          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+        } else if (IsAGPR) {
+          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+        } else {
+          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+        }
+      } // end of operand loop
+
+      if (MI.isCall()) {
+        // Pseudo used just to encode the underlying global. Is there a better
+        // way to track this?
+
+        const MachineOperand *CalleeOp =
+            TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+
+        const Function *Callee = getCalleeFunction(*CalleeOp);
+        if (!Callee || Callee->isDeclaration()) {
+          // If this is a call to an external function, we can't do much. Make
+          // conservative guesses.
+          // 48 SGPRs - vcc, - flat_scr, -xnack
+          int MaxSGPRGuess = 47 - IsaInfo::getNumExtraSGPRs(
+                                      &ST, true, ST.hasFlatAddressSpace());
+          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
+          MaxVGPR = std::max(MaxVGPR, 23);
+          MaxAGPR = std::max(MaxAGPR, 23);
+
+          CalleeFrameSize =
+              std::max(CalleeFrameSize,
+                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
+          Info->setVCCUsage(true);
+          Info->setFlatScratchUsage(ST.hasFlatAddressSpace());
+          Info->setDynamicallySizedStackUsage(true);
+        } else {
+          // We force CodeGen to run in SCC order, so the callee's register
+          // usage etc. should be the cumulative usage of all callees.
+
+          auto I = CallGraphResourceMap.find(Callee);
+          if (I == CallGraphResourceMap.end()) {
+            // Avoid crashing on undefined behavior with an illegal call to a
+            // kernel. If a callsite's calling convention doesn't match the
+            // function's, it's undefined behavior. If the callsite calling
+            // convention does match, that would have errored earlier.
+            // FIXME: The verifier shouldn't allow this.
+            if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+              report_fatal_error("invalid call to entry function");
+
+            llvm_unreachable("callee should have been handled before caller");
+          }
+
+          assert(I->second && "Null resource info");
+          MaxSGPR = std::max(I->second->getNumExplicitSGPR() - 1, MaxSGPR);
+          MaxVGPR = std::max(I->second->getNumVGPR() - 1, MaxVGPR);
+          MaxAGPR = std::max(I->second->getNumAGPR() - 1, MaxAGPR);
+          CalleeFrameSize =
+              std::max(I->second->getPrivateSegmentSize(), CalleeFrameSize);
+          Info->setVCCUsage(Info->usesVCC() || I->second->usesVCC());
+          Info->setFlatScratchUsage(Info->usesFlatScratch() ||
+                                    I->second->usesFlatScratch());
+          Info->setDynamicallySizedStackUsage(
+              Info->hasDynamicallySizedStack() ||
+              I->second->hasDynamicallySizedStack());
+          Info->setRecursionUsage(Info->hasRecursion() ||
+                                  I->second->hasRecursion());
+        }
+
+        // FIXME: Call site could have norecurse on it
+        if (!Callee || !Callee->doesNotRecurse())
+          Info->setRecursionUsage(true);
+      }
+    } // end of instruction loop
+  } // end of BB loop
+
+  Info->setNumExplicitSGPR(MaxSGPR + 1);
+  Info->setNumVGPR(MaxVGPR + 1);
+  Info->setNumAGPR(MaxAGPR + 1);
+  Info->setPrivateSegmentSize(Info->getPrivateSegmentSize() + CalleeFrameSize);
+}
+
+std::unique_ptr<SIFunctionResourceInfo>
+SIFunctionResourceInfoTracker::analyzeResourceUsage(
+    const MachineFunction &MF) {
+  auto Info = std::make_unique<SIFunctionResourceInfo>();
+  Info->setName(MF.getName());
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  Info->setFlatScratchUsage(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+                            MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI));
+
+  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+  // instructions aren't used to access the scratch buffer. Inline assembly
+  // may need it though.
+  //
+  // If we only have implicit uses of flat_scr on flat instructions, it is not
+  // really needed.
+  if (Info->usesFlatScratch() && !MFI->hasFlatScratchInit() &&
+      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+    Info->setFlatScratchUsage(false);
+  }
+
+  Info->setPrivateSegmentSize(FrameInfo.getStackSize());
+
+  // Assume a big number if there are any unknown sized objects.
+  Info->setDynamicallySizedStackUsage(FrameInfo.hasVarSizedObjects());
+  if (Info->hasDynamicallySizedStack())
+    Info->setPrivateSegmentSize(Info->getPrivateSegmentSize() +
+                                AssumedStackSizeForDynamicSizeObjects);
+
+  if (MFI->isStackRealigned())
+    Info->setPrivateSegmentSize(Info->getPrivateSegmentSize() +
+                                FrameInfo.getMaxAlign().value());
+
+  Info->setVCCUsage(MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+                    MRI.isPhysRegUsed(AMDGPU::VCC_HI));
+
+  // If there are no calls, MachineRegisterInfo can tell us the used register
+  // count easily.
+  // A tail call isn't considered a call for MachineFrameInfo's purposes.
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+    Info->setNumVGPR(getLeafFunctionVGPRsUsed(MF));
+    if (ST.hasMAIInsts())
+      Info->setNumAGPR(getLeafFunctionAGPRsUsed(MF));
+    Info->setNumExplicitSGPR(getLeafFunctionSGPRsUsed(MF));
+    return Info;
+  }
+
+  gatherResourceInfoFromCallGraph(Info, MF);
+  return Info;
+}
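
Finally, a sketch of how a client such as AMDGPUAsmPrinter is expected to drive the tracker, assuming callees are visited before callers (SCC order). The function name is hypothetical and error handling is elided:

#include "SIFunctionResourceInfoTracker.h"
using namespace llvm;

void emitFunctionResourceInfo(SIFunctionResourceInfoTracker &Tracker,
                              const MachineFunction &MF) {
  // Callees were already analyzed, so gatherResourceInfoFromCallGraph can
  // look their records up in CallGraphResourceMap.
  std::unique_ptr<SIFunctionResourceInfo> Info =
      Tracker.analyzeResourceUsage(MF);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  int32_t TotalSGPRs = Info->getTotalNumSGPRs(ST); // explicit + VCC/flat_scr
  int32_t TotalVGPRs = Info->getTotalNumVGPRs(ST); // max(arch VGPRs, AGPRs)
  uint64_t StackBytes = Info->getPrivateSegmentSize();
  (void)TotalSGPRs; (void)TotalVGPRs; (void)StackBytes; // emitted as comments

  // Cache non-entry functions so later callers can fold in this usage.
  if (!MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
    Tracker.CallGraphResourceMap[&MF.getFunction()] = std::move(Info);
}
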