diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -493,7 +493,7 @@
   bool TryMaximizeOccupancy) {
   const auto &ST = MF.getSubtarget<GCNSubtarget>();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  auto TgtOcc = MFI->getMinAllowedOccupancy();
+  auto TgtOcc = MFI->getMinAllowedOccupancy(ST);
 
   sortRegionsByPressure(TgtOcc);
   auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
@@ -581,7 +581,7 @@
   bool TryMaximizeOccupancy) {
   const auto &ST = MF.getSubtarget<GCNSubtarget>();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  auto TgtOcc = MFI->getMinAllowedOccupancy();
+  auto TgtOcc = MFI->getMinAllowedOccupancy(ST);
 
   sortRegionsByPressure(TgtOcc);
   auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -358,9 +358,9 @@
-  // Allow memory bound functions to drop to 4 waves if not limited by an
-  // attribute.
+  // Allow memory bound functions to drop to the minimum allowed occupancy if
+  // not limited by an attribute.
   if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
-      WavesAfter >= MFI.getMinAllowedOccupancy()) {
+      WavesAfter >= MFI.getMinAllowedOccupancy(ST)) {
     LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
-                      << MFI.getMinAllowedOccupancy() << " waves\n");
+                      << MFI.getMinAllowedOccupancy(ST) << " waves\n");
     NewOccupancy = WavesAfter;
   }
   if (NewOccupancy < MinOccupancy) {
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -251,7 +251,7 @@
   RPT.advanceToNext();
   GCNRegPressure MaxPressure = RPT.moveMaxPressure();
   unsigned Occupancy = MaxPressure.getOccupancy(*ST);
-  if (Occupancy >= MFI->getMinAllowedOccupancy() &&
+  if (Occupancy >= MFI->getMinAllowedOccupancy(*ST) &&
       MaxPressure.getVGPRNum() <= MaxVGPRs &&
       MaxPressure.getSGPRNum() <= MaxSGPRs) {
     LastRecordedOccupancy = Occupancy;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -36,6 +36,7 @@
 
 namespace llvm {
 
+class GCNSubtarget;
 class MachineFrameInfo;
 class MachineFunction;
 class TargetRegisterClass;
@@ -915,11 +916,10 @@
     return Occupancy;
   }
 
-  unsigned getMinAllowedOccupancy() const {
-    if (!isMemoryBound() && !needsWaveLimiter())
-      return Occupancy;
-    return (Occupancy < 4) ? Occupancy : 4;
-  }
+  /// Returns the current occupancy unless the function is memory bound or
+  /// needs the wave limiter; in that case returns the smaller of the current
+  /// occupancy and a floor derived from \p ST (total VGPRs / 64).
+  unsigned getMinAllowedOccupancy(const GCNSubtarget &ST) const;
 
   void limitOccupancy(const MachineFunction &MF);
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -190,6 +190,18 @@
   S.consumeInteger(0, GDSSize);
 }
 
+unsigned
+SIMachineFunctionInfo::getMinAllowedOccupancy(const GCNSubtarget &ST) const {
+  if (!isMemoryBound() && !needsWaveLimiter())
+    return Occupancy;
+  // Allow a minimum of 16 threads per SIMD lane, which works out as:
+  // - 4 waves per SIMD for GFX9 and below
+  // - 8 waves per SIMD for GFX10 wave64
+  // - 16 waves per SIMD for GFX10 wave32
+  unsigned MinOccupancy = ST.getTotalNumVGPRs() / 64;
+  return std::min(Occupancy, MinOccupancy);
+}
+
 void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
   limitOccupancy(getMaxWavesPerEU());
   const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();