diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -58,6 +58,8 @@
 
   void initializeTargetID(const Module &M);
 
+  bool doInitialization(Module &M) override;
+
   SIProgramInfo CurrentProgramInfo;
   DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
 
@@ -101,6 +103,11 @@
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
 
+  // Largest SGPR budget over all non-kernel functions of the module.
+  unsigned NonKernelMaxSGPRs = 0;
+  // Largest VGPR budget over all non-kernel functions of the module.
+  unsigned NonKernelMaxVGPRs = 0;
+
   StringRef getPassName() const override;
 
   const MCSubtargetInfo* getGlobalSTI() const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -627,6 +627,21 @@
   return false;
 }
 
+bool AMDGPUAsmPrinter::doInitialization(Module &M) {
+  NonKernelMaxSGPRs = 0;
+  NonKernelMaxVGPRs = 0;
+  // Compute an upper bound on the number of SGPRs and VGPRs
+  // over every non-kernel function of the module.
+  for (const Function &F : M) {
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
+      const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, STM.getMaxNumSGPRs(F));
+      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, STM.getMaxNumVGPRs(F));
+    }
+  }
+  return AsmPrinter::doInitialization(M);
+}
+
 // TODO: Fold this into emitFunctionBodyStart.
 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
   // In the beginning all features are either 'Any' or 'NotSupported',
@@ -1020,14 +1035,20 @@
               AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
             report_fatal_error("invalid call to entry function");
 
-          // If this is a call to an external function, we can't do much. Make
-          // conservative guesses.
-
-          // 48 SGPRs - vcc, - flat_scr, -xnack
-          int MaxSGPRGuess =
-              47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
-          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
-          MaxVGPR = std::max(MaxVGPR, 23);
+          unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+              TM.getMCSubtargetInfo(), false, ST.hasFlatAddressSpace());
+          // If this is a call to an external function or an indirect call,
+          // assume the callee may need the largest register budget of any
+          // non-kernel function in the module, as computed in
+          // doInitialization(). That budget already includes the extra
+          // SGPRs (e.g. flat_scratch and vcc), which are added again later,
+          // so subtract them here to avoid counting them twice. The "- 1"
+          // turns a register count into the highest register index.
+          // Take the max with the running value so registers already seen in
+          // this function are never under-reported.
+          MaxSGPR = std::max<int>(MaxSGPR, NonKernelMaxSGPRs - ExtraSGPRs - 1);
+          MaxVGPR = std::max<int>(MaxVGPR, NonKernelMaxVGPRs - 1);
+          // TODO: handle AGPRs
           MaxAGPR = std::max(MaxAGPR, 23);
 
           CalleeFrameSize = std::max(CalleeFrameSize,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -698,12 +698,12 @@
   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
 }
 
-unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+unsigned
+GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
   if (getGeneration() >= AMDGPUSubtarget::GFX10)
     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
 
-  if (MFI.hasFlatScratchInit()) {
+  if (HasFlatScratchInit) {
     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@@ -715,6 +715,26 @@
   return 2; // VCC.
 }
 
+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
+}
+
+unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
+  // Mirror how SIMachineFunctionInfo decides whether the function has
+  // flat scratch init, using only IR-level information.
+  bool FunctionHasFlatScratchInit = false;
+  bool HasCalls = F.hasFnAttribute("amdgpu-calls");
+  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
+  if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
+      (isAmdHsaOrMesa(F) || enableFlatScratch()) &&
+      !flatScratchIsArchitected()) {
+    if (HasCalls || HasStackObjects || enableFlatScratch())
+      FunctionHasFlatScratchInit = true;
+  }
+  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
+}
+
 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                         unsigned NumSGPRs,
                                         unsigned NumVGPRs) const {
@@ -728,13 +748,11 @@
   return Occupancy;
 }
 
-unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
-  const Function &F = MF.getFunction();
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
+unsigned GCNSubtarget::getBaseMaxNumSGPRs(
+    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
+    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
   // Compute maximum number of SGPRs function can use using default/requested
   // minimum number of waves per execution unit.
-  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
 
@@ -745,7 +763,7 @@
       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 
     // Make sure requested value does not violate subtarget's specifications.
-    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+    if (Requested && (Requested <= ReservedNumSGPRs))
       Requested = 0;
 
     // If more SGPRs are required to support the input user/system SGPRs,
@@ -755,7 +773,7 @@
     // of reserved special registers in total. Theoretically you could re-use
     // the last input registers for these special registers, but this would
     // require a lot of complexity to deal with the weird aliasing.
-    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+    unsigned InputNumSGPRs = PreloadedSGPRs;
     if (Requested && Requested < InputNumSGPRs)
       Requested = InputNumSGPRs;
 
@@ -774,17 +792,43 @@
   if (hasSGPRInitBug())
     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
 
-  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
-                  MaxAddressableNumSGPRs);
+  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
 }
 
-unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
   const Function &F = MF.getFunction();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
+                            getReservedNumSGPRs(MF));
+}
+
+static unsigned getMaxNumPreloadedSGPRs() {
+  // Max number of user SGPRs
+  unsigned MaxUserSGPRs = 4 + // private segment buffer
+                          2 + // Dispatch ptr
+                          2 + // queue ptr
+                          2 + // kernarg segment ptr
+                          2 + // dispatch ID
+                          2 + // flat scratch init
+                          2;  // Implicit buffer ptr
+  // Max number of system SGPRs
+  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
+                            1 + // WorkGroupIDY
+                            1 + // WorkGroupIDZ
+                            1 + // WorkGroupInfo
+                            1;  // private segment wave byte offset
+  return MaxUserSGPRs + MaxSystemSGPRs;
+}
+
+unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
+  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
+                            getReservedNumSGPRs(F));
+}
 
+unsigned GCNSubtarget::getBaseMaxNumVGPRs(
+    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
   // Compute maximum number of VGPRs function can use using default/requested
   // minimum number of waves per execution unit.
-  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
 
   // Check if maximum number of VGPRs was explicitly requested using
@@ -811,6 +855,16 @@
   return MaxNumVGPRs;
 }
 
+unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
+  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+}
+
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+  const Function &F = MF.getFunction();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+}
+
 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                          int UseOpIdx, SDep &Dep) const {
   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1033,9 +1033,24 @@
     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
   }
 
-  /// \returns Reserved number of SGPRs for given function \p MF.
+  /// \returns Reserved number of SGPRs. This is the common
+  /// utility function called by the MachineFunction and
+  /// Function variants of getReservedNumSGPRs.
+  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
+  /// \returns Reserved number of SGPRs for given machine function \p MF.
   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
 
+  /// \returns Reserved number of SGPRs for given function \p F.
+  unsigned getReservedNumSGPRs(const Function &F) const;
+
+  /// \returns max num SGPRs. This is the common utility
+  /// function called by the MachineFunction and Function
+  /// variants of getMaxNumSGPRs.
+  unsigned getBaseMaxNumSGPRs(const Function &F,
+                              std::pair<unsigned, unsigned> WavesPerEU,
+                              unsigned PreloadedSGPRs,
+                              unsigned ReservedNumSGPRs) const;
+
   /// \returns Maximum number of SGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of SGPRs explicitly
   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
@@ -1046,6 +1061,16 @@
   /// unit requirement.
   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
 
+  /// \returns Maximum number of SGPRs that meets number of waves per execution
+  /// unit requirement for function \p F, or number of SGPRs explicitly
+  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumSGPRs(const Function &F) const;
+
   /// \returns VGPR allocation granularity supported by the subtarget.
   unsigned getVGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
@@ -1078,6 +1103,20 @@
     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
   }
 
+  /// \returns max num VGPRs. This is the common utility function
+  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
+  unsigned getBaseMaxNumVGPRs(const Function &F,
+                              std::pair<unsigned, unsigned> WavesPerEU) const;
+  /// \returns Maximum number of VGPRs that meets number of waves per execution
+  /// unit requirement for function \p F, or number of VGPRs explicitly
+  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumVGPRs(const Function &F) const;
+
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -154,21 +154,22 @@
 declare void @undef_func()
 
 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 24
-; GFX90A: .amdhsa_next_free_vgpr 48
-; GFX90A: .amdhsa_accum_offset 24
-; GCN: NumVgprs: 24
+; GFX908: .amdhsa_next_free_vgpr 128
+; GFX90A: .amdhsa_next_free_vgpr 280
+; GFX90A: .amdhsa_accum_offset 256
+; GFX908: NumVgprs: 128
+; GFX90A: NumVgprs: 256
 ; GCN: NumAgprs: 24
-; GFX908: TotalNumVgprs: 24
-; GFX90A: TotalNumVgprs: 48
-; GFX908: VGPRBlocks: 5
-; GFX90A: VGPRBlocks: 5
-; GFX908: NumVGPRsForWavesPerEU: 24
-; GFX90A: NumVGPRsForWavesPerEU: 48
-; GFX90A: AccumOffset: 24
-; GFX908: Occupancy: 10
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
+; GFX908: TotalNumVgprs: 128
+; GFX90A: TotalNumVgprs: 280
+; GFX908: VGPRBlocks: 31
+; GFX90A: VGPRBlocks: 34
+; GFX908: NumVGPRsForWavesPerEU: 128
+; GFX90A: NumVGPRsForWavesPerEU: 280
+; GFX90A: AccumOffset: 256
+; GFX908: Occupancy: 2
+; GFX90A: Occupancy: 1
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -147,9 +147,9 @@
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT: - .registers:
-; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
+; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
 ; SDAG-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
-; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
+; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
 ; GISEL-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT: .shader_functions:
 ; GCN-NEXT: dynamic_stack:
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -227,10 +227,10 @@
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 48
-; VI-NOBUG: NumSgprs: 48
+; CI: NumSgprs: 102
+; VI-NOBUG: NumSgprs: 102
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 24
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr96_external_call() {
 entry:
   tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@@ -241,10 +241,10 @@
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 48
-; VI-NOBUG: NumSgprs: 48
+; CI: NumSgprs: 102
+; VI-NOBUG: NumSgprs: 102
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 24
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr160_external_call() {
 entry:
   tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -255,10 +255,10 @@
 ; Make sure there's no assert when a vgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_vgpr160_external_call
 ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 48
-; VI-NOBUG: NumSgprs: 48
+; CI: NumSgprs: 102
+; VI-NOBUG: NumSgprs: 102
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 24
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_vgpr160_external_call() {
 entry:
   tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -15,8 +15,8 @@
 ; GCN-NEXT: amd_machine_version_stepping = 0
 ; GCN-NEXT: kernel_code_entry_byte_offset = 256
 ; GCN-NEXT: kernel_code_prefetch_byte_size = 0
-; GCN-NEXT: granulated_workitem_vgpr_count = 7
-; GCN-NEXT: granulated_wavefront_sgpr_count = 5
+; GCN-NEXT: granulated_workitem_vgpr_count = 15
+; GCN-NEXT: granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT: priority = 0
 ; GCN-NEXT: float_mode = 240
 ; GCN-NEXT: priv = 0
@@ -59,8 +59,8 @@
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 0
 ; GCN-NEXT: workgroup_fbarrier_count = 0
-; GCN-NEXT: wavefront_sgpr_count = 48
-; GCN-NEXT: workitem_vgpr_count = 32
+; GCN-NEXT: wavefront_sgpr_count = 102
+; GCN-NEXT: workitem_vgpr_count = 64
 ; GCN-NEXT: reserved_vgpr_first = 0
 ; GCN-NEXT: reserved_vgpr_count = 0
 ; GCN-NEXT: reserved_sgpr_first = 0
@@ -111,8 +111,8 @@
 ; GCN-NEXT: amd_machine_version_stepping = 0
 ; GCN-NEXT: kernel_code_entry_byte_offset = 256
 ; GCN-NEXT: kernel_code_prefetch_byte_size = 0
-; GCN-NEXT: granulated_workitem_vgpr_count = 7
-; GCN-NEXT: granulated_wavefront_sgpr_count = 5
+; GCN-NEXT: granulated_workitem_vgpr_count = 15
+; GCN-NEXT: granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT: priority = 0
 ; GCN-NEXT: float_mode = 240
 ; GCN-NEXT: priv = 0
@@ -155,8 +155,8 @@
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 0
 ; GCN-NEXT: workgroup_fbarrier_count = 0
-; GCN-NEXT: wavefront_sgpr_count = 48
-; GCN-NEXT: workitem_vgpr_count = 32
+; GCN-NEXT: wavefront_sgpr_count = 102
+; GCN-NEXT: workitem_vgpr_count = 64
 ; GCN-NEXT: reserved_vgpr_first = 0
 ; GCN-NEXT: reserved_vgpr_count = 0
 ; GCN-NEXT: reserved_sgpr_first = 0