diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H #include "GCNRegPressure.h" +#include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/MachineScheduler.h" namespace llvm { @@ -120,10 +121,16 @@ // Region pressure cache. SmallVector Pressure; - // List of trivially rematerializable instructions we can remat to reduce RP. - // First MI is the MI to remat and second MI is the position we should remat - // before, usually the MI using the rematerializable instruction. - SmallVector> RematerializableInsts; + // Each region at MinOccupancy will have their own list of trivially + // rematerializable instructions we can remat to reduce RP. The list maps an + // instruction to the position we should remat before, usually the MI using + // the rematerializable instruction. + MapVector> + RematerializableInsts; + + // Map a trivially remateriazable def to a list of regions at MinOccupancy + // that has the defined reg as a live-in. + DenseMap> RematDefToLiveInRegions; // Temporary basic block live-in cache. DenseMap MBBLiveIns; @@ -133,7 +140,7 @@ // Collect all trivially rematerializable VGPR instructions with a single def // and single use outside the defining block into RematerializableInsts. - void collectRematerializableInstructions(unsigned HighRPIdx); + void collectRematerializableInstructions(); bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA); @@ -141,7 +148,7 @@ // Attempt to reduce RP of VGPR by sinking trivially rematerializable // instructions. Returns true if we were able to sink instruction(s). bool sinkTriviallyRematInsts(const GCNSubtarget &ST, - const TargetInstrInfo *TII, unsigned HighRPIdx); + const TargetInstrInfo *TII); // Return current region pressure. GCNRegPressure getRealRegPressure() const; @@ -150,8 +157,11 @@ void computeBlockPressure(const MachineBasicBlock *MBB); // Update region boundaries when removing MI or inserting NewMI before MI. - void updateRegionBoundaries(MachineBasicBlock::iterator MI, - MachineInstr *NewMI, bool Removing = false); + void updateRegionBoundaries( + SmallVectorImpl> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, + bool Removing = false); public: GCNScheduleDAGMILive(MachineSchedContext *C, diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -638,7 +638,7 @@ } if (Stage == PreRARematerialize) { - if (RegionsWithMinOcc.count() != 1 || Regions.size() == 1) + if (RegionsWithMinOcc.none() || Regions.size() == 1) break; const GCNSubtarget &ST = MF.getSubtarget(); @@ -654,10 +654,9 @@ static_assert(LastStage == PreRARematerialize, "Passes after PreRARematerialize are not supported"); - unsigned HighRPIdx = RegionsWithMinOcc.find_first(); - collectRematerializableInstructions(HighRPIdx); + collectRematerializableInstructions(); if (RematerializableInsts.empty() || - !sinkTriviallyRematInsts(ST, TII, HighRPIdx)) + !sinkTriviallyRematInsts(ST, TII)) break; LLVM_DEBUG( @@ -720,10 +719,8 @@ } while (Stage != LastStage); } -void GCNScheduleDAGMILive::collectRematerializableInstructions( - unsigned HighRPIdx) { +void GCNScheduleDAGMILive::collectRematerializableInstructions() { const SIRegisterInfo *SRI = static_cast(TRI); - const GCNRPTracker::LiveRegSet &HighRPLiveIns = LiveIns[HighRPIdx]; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); if (!LIS->hasInterval(Reg)) @@ -734,12 +731,6 @@ !MRI.hasOneUse(Reg)) continue; - // We are only collecting defs that are live-through or defined in another - // block and used inside this region. This means that the register must be - // in the live-in set for this region, else skip this def. - if (HighRPLiveIns.find(Reg) == HighRPLiveIns.end()) - continue; - MachineInstr *Def = MRI.getOneDef(Reg)->getParent(); if (!Def || Def->getOperand(0).getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA)) @@ -749,82 +740,157 @@ if (Def->getParent() == UseI->getParent()) continue; - RematerializableInsts.push_back(std::make_pair(Def, UseI)); + // We are only collecting defs that are defined in another block and are + // live-through or used inside regions at MinOccupancy. This means that the + // register must be in the live-in set for the region. + for (unsigned I = 0, E = Regions.size(); I != E; ++I) { + if (!RegionsWithMinOcc[I]) + continue; + + auto It = LiveIns[I].find(Reg); + if (It != LiveIns[I].end() && !It->second.none()) { + RematerializableInsts[I][Def] = UseI; + RematDefToLiveInRegions[Def].push_back(I); + } + } } } bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, - const TargetInstrInfo *TII, - unsigned HighRPIdx) { - RescheduleRegions.reset(); - GCNRPTracker::LiveRegSet NewLiveIns; - // We may not need to rematerialize all instructions. Keep a list of - // instructions we are rematerializing at the end. - SmallVector, 4> - TrivialRematDefsToSink; - - GCNRegPressure RegionPressure = Pressure[HighRPIdx]; - int VGPRUsage = RegionPressure.getVGPRNum(ST.hasGFX90AInsts()); - int SGPRUsage = RegionPressure.getSGPRNum(); - - // TODO: Handle occupancy drop due to AGPR and SGPR. - // Check if cause of occupancy drop is due to VGPR usage. - if (ST.getOccupancyWithNumVGPRs(VGPRUsage) > MinOccupancy || - ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy) - return false; + const TargetInstrInfo *TII) { + // Temporary copies of cached variables we will be modifying and replacing if + // sinking succeeds. + SmallVector< + std::pair, 32> + NewRegions; + SmallVector NewLiveIns; + SmallVector NewPressure; + BitVector NewRescheduleRegions; + + NewLiveIns.resize(Regions.size()); + NewPressure.resize(Regions.size()); + NewRegions.resize(Regions.size()); + NewRescheduleRegions.resize(Regions.size()); + + // Make copies of register pressure and live-ins cache that will be updated + // as we rematerialize. + NewPressure = Pressure; + NewLiveIns = LiveIns; + NewRegions = Regions; + NewRescheduleRegions.reset(); - NewLiveIns.copyFrom(LiveIns[HighRPIdx]); - // First check if we have enough trivially rematerializable instructions to - // improve occupancy. Optimistically assume all instructions we are able to - // sink decreased RP. - int TotalSinkableRegs = 0; - for (auto &It : RematerializableInsts) { - Register DefReg = It.first->getOperand(0).getReg(); - TotalSinkableRegs += SIRegisterInfo::getNumCoveredRegs(NewLiveIns[DefReg]); - } - int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; - unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); - // If in the most optimistic scenario, we cannot improve occupancy, then do - // not attempt to sink any instructions. - if (OptimisticOccupancy <= MinOccupancy) - return false; - - // Keep a list of newly rematerialized instructions so that we can easily - // undo if occupancy is not improved. DenseMap InsertedMIToOldDef; - GCNDownwardRPTracker RPT(*LIS); - auto *NonDbgMI = &*skipDebugInstructionsForward(Regions[HighRPIdx].first, - Regions[HighRPIdx].second); - unsigned ImproveOccupancy = 0; - for (auto &It : RematerializableInsts) { - MachineInstr *Def = It.first; - MachineBasicBlock::iterator InsertPos = - MachineBasicBlock::iterator(It.second); - Register Reg = Def->getOperand(0).getReg(); - // Rematerialize MI to its use block. Since we are only rematerializing - // instructions that do not have any virtual reg uses, we do not need to - // call LiveRangeEdit::allUsesAvailableAt() and - // LiveRangeEdit::canRematerializeAt(). - NewLiveIns[Reg] = LaneBitmask::getNone(); - TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - Def->getOperand(0).getSubReg(), *Def, *TRI); - MachineInstr *NewMI = &*(--InsertPos); - LIS->InsertMachineInstrInMaps(*NewMI); - LIS->removeInterval(Reg); - LIS->createAndComputeVirtRegInterval(Reg); - InsertedMIToOldDef[NewMI] = Def; - - // FIXME: Need better way to update RP without re-iterating over region - RPT.reset(*NonDbgMI, &NewLiveIns); - RPT.advance(Regions[HighRPIdx].second); - GCNRegPressure RPAfterSinking = RPT.moveMaxPressure(); - ImproveOccupancy = RPAfterSinking.getOccupancy(ST); - if (ImproveOccupancy > MinOccupancy) + bool Improved = true; + for (unsigned I = 0, E = Regions.size(); I != E; ++I) { + if (!RegionsWithMinOcc[I]) + continue; + + int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts()); + int SGPRUsage = NewPressure[I].getSGPRNum(); + + // TODO: Handle occupancy drop due to AGPR and SGPR. + // Check if cause of occupancy drop is due to VGPR usage and not SGPR. + if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy) { + Improved = false; break; + } + + // The occupancy of this region could have been improved by a previous + // iteration's sinking of defs. + if (NewPressure[I].getOccupancy(ST) > MinOccupancy) { + NewRescheduleRegions[I] = true; + continue; + } + + // First check if we have enough trivially rematerializable instructions to + // improve occupancy. Optimistically assume all instructions we are able to + // sink decreased RP. + int TotalSinkableRegs = 0; + for (const auto &It : RematerializableInsts[I]) { + MachineInstr *Def = It.first; + Register DefReg = Def->getOperand(0).getReg(); + TotalSinkableRegs += + SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]); + } + int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; + unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); + // If in the most optimistic scenario, we cannot improve occupancy, then do + // not attempt to sink any instructions. + if (OptimisticOccupancy <= MinOccupancy) { + Improved = false; + break; + } + + unsigned ImproveOccupancy = 0; + SmallVector SinkedDefs; + for (auto &It : RematerializableInsts[I]) { + MachineInstr *Def = It.first; + MachineBasicBlock::iterator InsertPos = + MachineBasicBlock::iterator(It.second); + Register Reg = Def->getOperand(0).getReg(); + // Rematerialize MI to its use block. Since we are only rematerializing + // instructions that do not have any virtual reg uses, we do not need to + // call LiveRangeEdit::allUsesAvailableAt() and + // LiveRangeEdit::canRematerializeAt(). + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + Def->getOperand(0).getSubReg(), *Def, *TRI); + MachineInstr *NewMI = &*(--InsertPos); + LIS->InsertMachineInstrInMaps(*NewMI); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + InsertedMIToOldDef[NewMI] = Def; + + // Update region boundaries in scheduling region we sinked from since we + // may sink an instruction that was at the beginning or end of its region + updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, + /*Removing =*/true); + + // Update region boundaries in region we sinked to. + updateRegionBoundaries(NewRegions, InsertPos, NewMI); + + LaneBitmask PrevMask = NewLiveIns[I][Reg]; + // Update RP for all tracked high RP regions that this has this MI as a + // live-in. + for (auto TrackedIdx : RematDefToLiveInRegions[Def]) { + NewLiveIns[TrackedIdx][Reg] = LaneBitmask::getNone(); + if (InsertPos->getParent() != Regions[TrackedIdx].first->getParent()) { + // Def is live-through and not used in this block. + NewPressure[TrackedIdx].inc(Reg, PrevMask, LaneBitmask::getNone(), + MRI); + } else { + // Def is used and rematerialized into this block. + GCNDownwardRPTracker RPT(*LIS); + auto *NonDbgMI = &*skipDebugInstructionsForward( + NewRegions[TrackedIdx].first, NewRegions[TrackedIdx].second); + RPT.reset(*NonDbgMI, &NewLiveIns[TrackedIdx]); + RPT.advance(NewRegions[TrackedIdx].second); + NewPressure[TrackedIdx] = RPT.moveMaxPressure(); + } + } + + SinkedDefs.push_back(Def); + ImproveOccupancy = NewPressure[I].getOccupancy(ST); + if (ImproveOccupancy > MinOccupancy) + break; + } + + // Remove defs we just sinked from all regions' list of sinkable defs + for (auto &Def : SinkedDefs) + for (auto TrackedIdx : RematDefToLiveInRegions[Def]) + RematerializableInsts[TrackedIdx].erase(Def); + + + if (ImproveOccupancy <= MinOccupancy) { + Improved = false; + break; + } + + NewRescheduleRegions[I] = true; } - if (ImproveOccupancy <= MinOccupancy) { - // Occupancy is not improved. Undo sinking for the region + if (!Improved) { + // Occupancy was not improved for all regions that were at MinOccupancy. + // Undo sinking and remove newly rematerialized instructions. for (auto &Entry : InsertedMIToOldDef) { MachineInstr *MI = Entry.first; MachineInstr *OldMI = Entry.second; @@ -835,16 +901,13 @@ LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); } - return false; + return Improved; } - // Occupancy is improved. + // Occupancy was improved for all regions. for (auto &Entry : InsertedMIToOldDef) { MachineInstr *MI = Entry.first; MachineInstr *OldMI = Entry.second; - // Update region boundaries in scheduling region we sinked from since we - // may sink an instruction that was at the beginning or end of its region - updateRegionBoundaries(OldMI, /*NewMI =*/nullptr, /*Removing =*/true); // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. BBLiveInMap.erase(OldMI); @@ -855,27 +918,21 @@ OldMI->eraseFromParent(); LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); - - // Update region boundaries in region we sinked to. - MachineBasicBlock::iterator InsertPos = - std::next(MachineBasicBlock::iterator(MI)); - updateRegionBoundaries(InsertPos, MI); } - // Update cached live-ins and register pressure after rematerializing - LiveIns[HighRPIdx].copyFrom(NewLiveIns); - MBBLiveIns.erase(Regions[HighRPIdx].first->getParent()); - - GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(Regions[HighRPIdx].first, Regions[HighRPIdx].second, - &LiveIns[HighRPIdx]); - Pressure[HighRPIdx] = RPTracker.moveMaxPressure(); + // Update live-ins, register pressure, and regions caches. + Regions = NewRegions; + LiveIns = NewLiveIns; + Pressure = NewPressure; + RescheduleRegions = NewRescheduleRegions; + for (unsigned I = 0, E = Regions.size(); I != E; ++I) + if (RegionsWithMinOcc[I]) + MBBLiveIns.erase(Regions[I].first->getParent()); SIMachineFunctionInfo &MFI = *MF.getInfo(); MFI.increaseOccupancy(MF, ++MinOccupancy); - RescheduleRegions[HighRPIdx] = true; - return true; + return Improved; } // Copied from MachineLICM @@ -896,34 +953,39 @@ // of a scheduling region and do not need to check the ending since we will only // ever be inserting before an already existing MI. void GCNScheduleDAGMILive::updateRegionBoundaries( + SmallVectorImpl> &RegionBoundaries, MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { - unsigned I = 0, E = Regions.size(); + unsigned I = 0, E = RegionBoundaries.size(); // Search for first region of the block where MI is located - while (I != E && MI->getParent() != Regions[I].first->getParent()) + while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) ++I; for (; I != E; ++I) { - if (MI->getParent() != Regions[I].first->getParent()) + if (MI->getParent() != RegionBoundaries[I].first->getParent()) return; - if (Removing && MI == Regions[I].first && MI == Regions[I].second) { + if (Removing && MI == RegionBoundaries[I].first && + MI == RegionBoundaries[I].second) { // MI is in a region with size 1, after removing, the region will be // size 0, set RegionBegin and RegionEnd to pass end of block iterator. - Regions[I] = + RegionBoundaries[I] = std::make_pair(MI->getParent()->end(), MI->getParent()->end()); return; } - if (MI == Regions[I].first) { + if (MI == RegionBoundaries[I].first) { if (Removing) - Regions[I] = std::make_pair(std::next(MI), Regions[I].second); + RegionBoundaries[I] = + std::make_pair(std::next(MI), RegionBoundaries[I].second); else // Inserted NewMI in front of region, set new RegionBegin to NewMI - Regions[I] = std::make_pair(MachineBasicBlock::iterator(NewMI), - Regions[I].second); + RegionBoundaries[I] = std::make_pair(MachineBasicBlock::iterator(NewMI), + RegionBoundaries[I].second); return; } - if (Removing && MI == Regions[I].second) { - Regions[I] = std::make_pair(Regions[I].first, std::prev(MI)); + if (Removing && MI == RegionBoundaries[I].second) { + RegionBoundaries[I] = + std::make_pair(RegionBoundaries[I].first, std::prev(MI)); return; } } diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -432,7 +432,6 @@ S_NOP 0, implicit %20, implicit %21 S_ENDPGM 0 ... -# TODO: Handle multiple blocks --- name: test_occ_9_two_block_high_rp tracksRegLiveness: true @@ -466,19 +465,18 @@ ; GFX908-NEXT: %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit %24 + ; GFX908-NEXT: %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit %23 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -486,6 +484,7 @@ ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit %26 + ; GFX908-NEXT: %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit %25 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -570,7 +569,6 @@ S_NOP 0, implicit %22 S_ENDPGM 0 ... -# TODO: Handle multiple blocks --- name: test_occ_9_two_block_high_rp_minimum_sinking tracksRegLiveness: true @@ -603,7 +601,6 @@ ; GFX908-NEXT: %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -611,12 +608,12 @@ ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit %24 + ; GFX908-NEXT: %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit %23, implicit %22 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} @@ -625,6 +622,7 @@ ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit %27 + ; GFX908-NEXT: %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit %25, implicit %26 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: