diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,7 @@
 #define GET_SUBTARGETINFO_CTOR
 #define AMDGPUSubtarget GCNSubtarget
 #include "AMDGPUGenSubtargetInfo.inc"
+#include "GCNSubtarget.h"
 #undef AMDGPUSubtarget
 
 static cl::opt<bool> EnablePowerSched(
@@ -673,6 +674,28 @@
   return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
 }
 
+// Returns the occupancy for \p VGPRs registers in \p MF, capped at the
+// upper waves-per-EU bound, with an accepted "amdgpu-num-vgpr" value
+// standing in for the total VGPR count when it is smaller.
+unsigned GCNSubtarget::getOccupancyWithNumVGPRsForFunction(
+    const MachineFunction &MF, unsigned VGPRs) const {
+  const Function &F = MF.getFunction();
+  std::pair<unsigned, unsigned> WavesPerEU = getWavesPerEU(F);
+  unsigned MaxWaves = WavesPerEU.second;
+  unsigned Granule = getVGPRAllocGranule();
+  // Below one allocation granule the answer is simply MaxWaves.
+  if (VGPRs < Granule)
+    return MaxWaves;
+  unsigned RoundedRegs = alignTo(VGPRs, Granule);
+  unsigned RequestedPerAttr = getFunctionNVGPRAttr(F, WavesPerEU);
+  unsigned NumVGPRAvailable = getTotalNumVGPRs();
+  // A non-zero request only ever shrinks the register budget.
+  if (RequestedPerAttr && RequestedPerAttr < NumVGPRAvailable)
+    NumVGPRAvailable = RequestedPerAttr;
+  // Report at least one wave and at most MaxWaves.
+  return std::min(std::max(NumVGPRAvailable / RoundedRegs, 1u), MaxWaves);
+}
+
 unsigned
 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
   if (getGeneration() >= AMDGPUSubtarget::GFX10)
@@ -806,9 +829,24 @@
 
   // Check if maximum number of VGPRs was explicitly requested using
   // "amdgpu-num-vgpr" attribute.
+  unsigned Requested = getFunctionNVGPRAttr(F, WavesPerEU);
+
+  if (Requested)
+    MaxNumVGPRs = Requested;
+
+  return MaxNumVGPRs;
+}
+
+// Returns the VGPR count requested through the "amdgpu-num-vgpr" function
+// attribute (doubled when hasGFX90AInsts()), or 0 when the attribute is
+// absent or the value is rejected by the waves-per-EU checks below.
+unsigned GCNSubtarget::getFunctionNVGPRAttr(
+    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
+  unsigned Requested = 0;
   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
-    unsigned Requested =
-        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
+    // Fall back to 0 ("no request") when the value cannot be parsed; any
+    // other fallback would be treated as a real VGPR request below.
+    Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", 0);
 
     if (hasGFX90AInsts())
       Requested *= 2;
@@ -817,15 +855,11 @@
     // default/requested minimum/maximum number of waves per execution unit.
     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
       Requested = 0;
-    if (WavesPerEU.second &&
-        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
+    if (WavesPerEU.second && Requested &&
+        Requested < getMinNumVGPRs(WavesPerEU.second))
       Requested = 0;
-
-    if (Requested)
-      MaxNumVGPRs = Requested;
   }
-
-  return MaxNumVGPRs;
+  return Requested;
 }
 
 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -930,10 +930,20 @@
 
   unsigned TargetOccupancy =
       std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
-  unsigned WavesAfter =
-      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
-  unsigned WavesBefore =
-      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
+
+  // Take the tighter of the SGPR limit and the function-aware VGPR limit,
+  // then cap it at TargetOccupancy.
+  unsigned WavesAfter = std::min(
+      TargetOccupancy,
+      std::min(ST.getOccupancyWithNumSGPRs(PressureAfter.getSGPRNum()),
+               ST.getOccupancyWithNumVGPRsForFunction(
+                   MF, PressureAfter.getVGPRNum(ST.hasGFX90AInsts()))));
+  unsigned WavesBefore = std::min(
+      TargetOccupancy,
+      std::min(ST.getOccupancyWithNumSGPRs(PressureBefore.getSGPRNum()),
+               ST.getOccupancyWithNumVGPRsForFunction(
+                   MF, PressureBefore.getVGPRNum(ST.hasGFX90AInsts()))));
+
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1142,6 +1142,11 @@
   /// VGPRs
   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 
+  /// Return the occupancy for \p VGPRs registers in \p MF, limited by the
+  /// function's waves-per-EU upper bound and its "amdgpu-num-vgpr" attribute.
+  unsigned getOccupancyWithNumVGPRsForFunction(const MachineFunction &MF,
+                                               unsigned VGPRs) const;
+
   /// Return occupancy for the given function. Used LDS and a number of
   /// registers if provided.
   /// Note, occupancy can be affected by the scratch allocation as well, but
@@ -1286,6 +1291,11 @@
   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
   unsigned getBaseMaxNumVGPRs(const Function &F,
                               std::pair<unsigned, unsigned> WavesPerEU) const;
+  /// \returns Number of VGPRs requested using "amdgpu-num-vgpr" attribute
+  /// attached to function \p F, or 0 if the attribute is absent or its value
+  /// is incompatible with \p WavesPerEU.
+  unsigned getFunctionNVGPRAttr(const Function &F,
+                                std::pair<unsigned, unsigned> WavesPerEU) const;
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll @@ -6,17 +6,17 @@ define amdgpu_kernel void @remat_constant_voids_spill(ptr addrspace(1) %p) #1 { ; GFX908-LABEL: remat_constant_voids_spill: ; GFX908: ; %bb.0: -; GFX908-NEXT: v_accvgpr_write_b32 a1, 1 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 6 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 7 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 8 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 9 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 2 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 1 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 2 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 3 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 4 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_write_b32 a1, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 6 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 7 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 8 +; GFX908-NEXT: v_accvgpr_write_b32 a4, 9 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -140,10 +140,10 @@ ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: v_readlane_b32 s16, v40, 22 +; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 -; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s15, s21 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX906-NEXT: v_readlane_b32 s17, v40, 23 @@ -515,10 +515,10 @@ ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; 
GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: v_readlane_b32 s16, v40, 22 +; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 -; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX908-NEXT: v_readlane_b32 s17, v40, 23