diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -64,6 +64,14 @@
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
+  enum : unsigned {
+    Collect,
+    InitialSchedule,
+    UnclusteredReschedule,
+    ClusteredLowOccupancyReschedule,
+    LastStage = ClusteredLowOccupancyReschedule
+  };
+
   const GCNSubtarget &ST;
 
   SIMachineFunctionInfo &MFI;
 
@@ -84,6 +92,10 @@
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
 
+  // Records if a region is not yet scheduled, or schedule has been reverted,
+  // or we generally desire to reschedule it.
+  BitVector RescheduleRegions;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,13 +316,13 @@
   ST(MF.getSubtarget<GCNSubtarget>()),
   MFI(*MF.getInfo<SIMachineFunctionInfo>()),
   StartingOccupancy(MFI.getOccupancy()),
-  MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+  MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
-  if (Stage == 0) {
+  if (Stage == Collect) {
     // Just record regions at the first pass.
     Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
     return;
@@ -348,6 +348,7 @@
 
   ScheduleDAGMILive::schedule();
   Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+  RescheduleRegions[RegionIdx] = false;
 
   if (!LIS)
     return;
@@ -389,20 +390,28 @@
                       << MinOccupancy << ".\n");
   }
 
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+  if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+      PressureAfter.getSGPRNum() > MaxSGPRs)
+    RescheduleRegions[RegionIdx] = true;
+
   if (WavesAfter >= MinOccupancy) {
-    unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
-    unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
-    if (WavesAfter > MFI.getMinWavesPerEU() ||
+    if (Stage == UnclusteredReschedule &&
+        !PressureAfter.less(ST, PressureBefore)) {
+      LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+    } else if (WavesAfter > MFI.getMinWavesPerEU() ||
         PressureAfter.less(ST, PressureBefore) ||
-        (TotalVGPRs >= PressureAfter.getVGPRNum() &&
-         TotalSGPRs >= PressureAfter.getSGPRNum())) {
+        !RescheduleRegions[RegionIdx]) {
       Pressure[RegionIdx] = PressureAfter;
       return;
+    } else {
+      LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     }
-    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
   }
 
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  RescheduleRegions[RegionIdx] = true;
   RegionEnd = RegionBegin;
   for (MachineInstr *MI : Unsched) {
     if (MI->isDebugInstr())
@@ -532,33 +541,55 @@
   LiveIns.resize(Regions.size());
   Pressure.resize(Regions.size());
+  RescheduleRegions.resize(Regions.size());
+  RescheduleRegions.set();
 
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
   do {
     Stage++;
     RegionIdx = 0;
     MachineBasicBlock *MBB = nullptr;
 
-    if (Stage > 1) {
+    if (Stage > InitialSchedule) {
+      if (!LIS)
+        break;
+
       // Retry function scheduling if we found resulting occupancy and it is
       // lower than used for first pass scheduling. This will give more freedom
       // to schedule low register pressure blocks.
       // Code is partially copied from MachineSchedulerBase::scheduleRegions().
-      if (!LIS || StartingOccupancy <= MinOccupancy)
-        break;
+      if (Stage == UnclusteredReschedule) {
+        if (RescheduleRegions.none())
+          continue;
+        LLVM_DEBUG(dbgs() <<
+          "Retrying function scheduling without clustering.\n");
+      }
+
+      if (Stage == ClusteredLowOccupancyReschedule) {
+        if (StartingOccupancy <= MinOccupancy)
+          break;
 
-      LLVM_DEBUG(
-          dbgs()
-          << "Retrying function scheduling with lowest recorded occupancy "
-          << MinOccupancy << ".\n");
+        LLVM_DEBUG(
+            dbgs()
+            << "Retrying function scheduling with lowest recorded occupancy "
+            << MinOccupancy << ".\n");
 
-      S.setTargetOccupancy(MinOccupancy);
+        S.setTargetOccupancy(MinOccupancy);
+      }
     }
 
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+
     for (auto Region : Regions) {
+      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+        continue;
+
       RegionBegin = Region.first;
       RegionEnd = Region.second;
 
@@ -566,7 +597,7 @@
       if (MBB) finishBlock();
       MBB = RegionBegin->getParent();
       startBlock(MBB);
-      if (Stage == 1)
+      if (Stage == InitialSchedule)
         computeBlockPressure(MBB);
     }
 
@@ -594,5 +625,7 @@
     }
     finishBlock();
 
-  } while (Stage < 2);
+    if (Stage == UnclusteredReschedule)
+      SavedMutations.swap(Mutations);
+  } while (Stage != LastStage);
 }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Interleave loads and stores to fit into the 9 VGPR limit.
+; This requires avoiding load/store clustering.
+
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: NumVgprs: {{[0-9]$}}
+; GCN: ScratchSize: 0{{$}}
+
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
+  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
+  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
+  store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
+  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
+  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
+  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "amdgpu-num-vgpr"="9" }