// Patch summary (commentary only; text before the first "Index:" header is
// not part of any hunk).
//
// Purpose: record a single per-function occupancy value in
// SIMachineFunctionInfo and make the GCN schedulers read and update it,
// instead of each scheduler recomputing occupancy from the subtarget.
//
// SIMachineFunctionInfo.h / SIMachineFunctionInfo.cpp:
//   * Adds the member `unsigned Occupancy`.
//   * Initialisation hunk sets Occupancy = getMaxWavesPerEU() and then calls
//     limitOccupancy(MF).
//   * limitOccupancy(const MachineFunction &) lowers Occupancy to
//     getMaxWavesPerEU() and to
//     ST.getOccupancyWithLocalMemSize(getLDSSize(), MF.getFunction()).
//   * limitOccupancy(unsigned Limit) only ever lowers Occupancy.
//   * increaseOccupancy(MF, Limit) raises Occupancy to Limit if it is lower,
//     then re-applies limitOccupancy(MF).
//   * getMinAllowedOccupancy() returns Occupancy unless isMemoryBound() or
//     needsWaveLimiter() holds, in which case it returns min(Occupancy, 4).
//
// SIISelLowering.cpp:
//   * Calls Info->limitOccupancy(MF) right before
//     TargetLoweringBase::finalizeLowering(MF).
//
// GCNSchedStrategy.h / GCNSchedStrategy.cpp:
//   * The MFI reference member loses its const qualifier so that it can be
//     updated.
//   * StartingOccupancy is taken from MFI.getOccupancy().
//   * When MinOccupancy is lowered, the new value is also pushed into MFI via
//     MFI.limitOccupancy(MinOccupancy).
//
// GCNIterativeScheduler.cpp:
//   * First hunk: when NewOcc > Occ, MFI->increaseOccupancy(MF, NewOcc) is
//     called before returning std::max(NewOcc, Occ).
//   * scheduleLegacyMaxOccupancy and scheduleILP take TgtOcc from
//     MFI->getMinAllowedOccupancy(), accumulate FinalOccupancy as the minimum
//     of RP.getOccupancy(ST) over the scheduled regions, and finish with
//     MFI->limitOccupancy(FinalOccupancy).
//   * scheduleMinReg takes TgtOcc from MFI->getOccupancy().
//
// NOTE(review): the removed code bounded the memory-bound occupancy drop by
// std::max(MFI.getMinWavesPerEU(), 4u); getMinAllowedOccupancy() as added
// here does not consult getMinWavesPerEU(). Confirm that dropping the
// attribute-derived lower bound is intended.
// NOTE(review): the "Function is memory bound, allow occupancy drop" debug
// message is now guarded only by the getMinAllowedOccupancy() comparison,
// not by a direct isMemoryBound()/needsWaveLimiter() test. Confirm the
// message is still accurate on every path that reaches it.
// NOTE(review): calls such as MF.getInfo(), MF.getSubtarget() and
// llvm::make_unique(TII) appear without template arguments; presumably the
// angle-bracket text was lost when this patch was extracted. Verify against
// the original patch before applying.
Index: lib/Target/AMDGPU/GCNIterativeScheduler.cpp =================================================================== --- lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -478,13 +478,19 @@ } LLVM_DEBUG(dbgs() << "New occupancy = " << NewOcc << ", prev occupancy = " << Occ << '\n'); + if (NewOcc > Occ) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + MFI->increaseOccupancy(MF, NewOcc); + } + return std::max(NewOcc, Occ); } void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( bool TryMaximizeOccupancy) { const auto &ST = MF.getSubtarget(); - auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + SIMachineFunctionInfo *MFI = MF.getInfo(); + auto TgtOcc = MFI->getMinAllowedOccupancy(); sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); @@ -501,6 +507,7 @@ "target occupancy = " << TgtOcc << '\n'); GCNMaxOccupancySchedStrategy LStrgy(Context); + unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { // running first pass with TargetOccupancy = 0 mimics previous scheduling @@ -525,8 +532,10 @@ assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); } } + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); } } + MFI->limitOccupancy(FinalOccupancy); } /////////////////////////////////////////////////////////////////////////////// @@ -534,7 +543,8 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { const auto &ST = MF.getSubtarget(); - const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + const auto TgtOcc = MFI->getOccupancy(); sortRegionsByPressure(TgtOcc); auto MaxPressure = Regions.front()->MaxPressure; @@ -567,9 +577,8 @@ void GCNIterativeScheduler::scheduleILP( bool TryMaximizeOccupancy) { const auto &ST = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF), - MFI->getMaxWavesPerEU()); + 
SIMachineFunctionInfo *MFI = MF.getInfo(); + auto TgtOcc = MFI->getMinAllowedOccupancy(); sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); @@ -582,6 +591,7 @@ "target occupancy = " << TgtOcc << '\n'); + unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (auto R : Regions) { BuildDAG DAG(*R, *this); const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); @@ -599,6 +609,8 @@ } else { scheduleRegion(*R, ILPSchedule, RP); LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); } } + MFI->limitOccupancy(FinalOccupancy); } Index: lib/Target/AMDGPU/GCNSchedStrategy.h =================================================================== --- lib/Target/AMDGPU/GCNSchedStrategy.h +++ lib/Target/AMDGPU/GCNSchedStrategy.h @@ -64,7 +64,7 @@ const SISubtarget &ST; - const SIMachineFunctionInfo &MFI; + SIMachineFunctionInfo &MFI; // Occupancy target at the beginning of function scheduling cycle. unsigned StartingOccupancy; Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp =================================================================== --- lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -308,9 +308,7 @@ ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget()), MFI(*MF.getInfo()), - StartingOccupancy(std::min(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), - MF.getFunction()), - MFI.getMaxWavesPerEU())), + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); @@ -374,16 +372,15 @@ unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); // Allow memory bound functions to drop to 4 waves if not limited by an // attribute. 
- unsigned MinMemBoundWaves = std::max(MFI.getMinWavesPerEU(), 4u); if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && - WavesAfter >= MinMemBoundWaves && - (MFI.isMemoryBound() || MFI.needsWaveLimiter())) { + WavesAfter >= MFI.getMinAllowedOccupancy()) { LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " - << MinMemBoundWaves << " waves\n"); + << MFI.getMinAllowedOccupancy() << " waves\n"); NewOccupancy = WavesAfter; } if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; + MFI.limitOccupancy(MinOccupancy); LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " << MinOccupancy << ".\n"); } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -7724,6 +7724,8 @@ MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, Info->getScratchWaveOffsetReg()); + Info->limitOccupancy(MF); + TargetLoweringBase::finalizeLowering(MF); } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -186,6 +186,9 @@ unsigned HighBitsOf32BitAddress; + // Current recorded maximum possible occupancy. + unsigned Occupancy; + MCPhysReg getNextUserSGPR() const; MCPhysReg getNextSystemSGPR() const; @@ -641,6 +644,29 @@ llvm::make_unique(TII)); return PSV.first->second.get(); } + + unsigned getOccupancy() const { + return Occupancy; + } + + unsigned getMinAllowedOccupancy() const { + if (!isMemoryBound() && !needsWaveLimiter()) + return Occupancy; + return (Occupancy < 4) ? 
Occupancy : 4; + } + + void limitOccupancy(const MachineFunction &MF); + + void limitOccupancy(unsigned Limit) { + if (Occupancy > Limit) + Occupancy = Limit; + } + + void increaseOccupancy(const MachineFunction &MF, unsigned Limit) { + if (Occupancy < Limit) + Occupancy = Limit; + limitOccupancy(MF); + } }; } // end namespace llvm Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -55,6 +55,9 @@ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); + Occupancy = getMaxWavesPerEU(); + limitOccupancy(MF); + if (!isEntryFunction()) { // Non-entry functions have no special inputs for now, other registers // required for scratch access. @@ -176,6 +179,13 @@ S.consumeInteger(0, HighBitsOf32BitAddress); } +void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { + limitOccupancy(getMaxWavesPerEU()); + const SISubtarget& ST = MF.getSubtarget(); + limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(), + MF.getFunction())); +} + unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { ArgInfo.PrivateSegmentBuffer =