diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -26,11 +26,18 @@
 
 #define DEBUG_TYPE "amdgpu-set-wave-priority"
 
+static cl::opt<unsigned> VALUInstsThreshold(
+    "amdgpu-set-wave-priority-valu-insts-threshold",
+    cl::desc("VALU instruction count threshold for adjusting wave priority"),
+    cl::init(100), cl::Hidden);
+
 namespace {
 
 struct MBBInfo {
   MBBInfo() = default;
   bool MayReachVMEMLoad = false;
+  unsigned NumVALUInsts = 0;
+  MachineInstr *LastVMEMLoad = nullptr;
 };
 
 using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
@@ -46,7 +53,9 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
 private:
-  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+  MachineInstr *BuildSetprioMI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               unsigned priority) const;
 
   const SIInstrInfo *TII;
 };
@@ -62,9 +71,12 @@
   return new AMDGPUSetWavePriority();
 }
 
-MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
-                                                    unsigned priority) const {
-  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+MachineInstr *
+AMDGPUSetWavePriority::BuildSetprioMI(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned priority) const {
+  return BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETPRIO))
+      .addImm(priority);
 }
 
 // Checks that for every predecessor Pred that can reach a VMEM load,
@@ -82,10 +94,6 @@
   return true;
 }
 
-static bool isVMEMLoad(const MachineInstr &MI) {
-  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
-}
-
 bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
   const unsigned HighPriority = 3;
   const unsigned LowPriority = 0;
@@ -97,14 +105,41 @@
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
 
-  MBBInfoSet MBBInfos;
+  // Find VMEM loads that may be executed before the number of executed
+  // VALU instructions hits the specified threshold. We currently assume
+  // that backedges/loops, branch probabilities and other details can be
+  // ignored, so we essentially determine the largest number of VALU
+  // instructions along every possible path from the start of the
+  // function that may potentially be executed provided no backedge is
+  // ever taken.
   SmallVector<const MachineBasicBlock *> Worklist;
-  for (MachineBasicBlock &MBB : MF) {
-    if (any_of(MBB, isVMEMLoad))
-      Worklist.push_back(&MBB);
+  MBBInfoSet MBBInfos;
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (MachineBasicBlock *MBB : RPOT) {
+    unsigned NumVALUInsts = 0;
+    for (const MachineBasicBlock *Pred : MBB->predecessors())
+      NumVALUInsts = std::max(NumVALUInsts, MBBInfos[Pred].NumVALUInsts);
+
+    MachineInstr *LastVMEMLoad = nullptr;
+    for (MachineInstr &MI : *MBB) {
+      if (NumVALUInsts >= VALUInstsThreshold)
+        break;
+      if (SIInstrInfo::isVMEM(MI) && MI.mayLoad())
+        LastVMEMLoad = &MI;
+      if (SIInstrInfo::isVALU(MI))
+        ++NumVALUInsts;
+    }
+
+    MBBInfo &Info = MBBInfos[MBB];
+    Info.NumVALUInsts = NumVALUInsts;
+
+    if (LastVMEMLoad) {
+      Info.LastVMEMLoad = LastVMEMLoad;
+      Worklist.push_back(MBB);
+    }
   }
 
-  // Mark blocks from which control may reach VMEM loads.
+  // Mark blocks from which control may reach the VMEM loads.
   while (!Worklist.empty()) {
     const MachineBasicBlock *MBB = Worklist.pop_back_val();
     MBBInfo &Info = MBBInfos[MBB];
@@ -122,10 +157,10 @@
   MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
   while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
     ++I;
-  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+  BuildSetprioMI(Entry, I, HighPriority);
 
   // Lower the priority on edges where control leaves blocks from which
-  // VMEM loads are reachable.
+  // the VMEM loads are reachable.
   SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
   for (MachineBasicBlock &MBB : MF) {
     if (MBBInfos[&MBB].MayReachVMEMLoad) {
@@ -152,14 +187,12 @@
   }
 
   for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
-    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
-    while (I != B) {
-      if (isVMEMLoad(*--I)) {
-        ++I;
-        break;
-      }
-    }
-    MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+    BuildSetprioMI(
+        *MBB,
+        MBBInfos[MBB].LastVMEMLoad
+            ? std::next(MachineBasicBlock::iterator(MBBInfos[MBB].LastVMEMLoad))
+            : MBB->begin(),
+        LowPriority);
   }
 
   return true;
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
--- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
-; RUN:   FileCheck %s
+; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true \
+; RUN:     -amdgpu-set-wave-priority-valu-insts-threshold=4 -o - %s | FileCheck %s
 
 ; CHECK-LABEL: no_setprio:
 ; CHECK-NOT: s_setprio
@@ -150,4 +150,33 @@
   ret <2 x float> %sum
 }
 
+; CHECK-LABEL: valu_insts_threshold:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK-COUNT-4: v_add_f32_e32
+; CHECK: s_cbranch_scc0 [[A:.*]]
+; CHECK: {{.*}}: ; %b
+; CHECK-NEXT: buffer_load_dwordx2
+; CHECK: s_branch [[END:.*]]
+; CHECK: [[A]]: ; %a
+; CHECK: s_branch [[END]]
+; CHECK: [[END]]:
+define amdgpu_ps <2 x float> @valu_insts_threshold(<4 x i32> inreg %p, i32 inreg %i) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %add = fadd <2 x float> %v, %v
+  %add2 = fadd <2 x float> %add, %add
+
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> %add2
+
+b:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0)
+  %sub = fsub <2 x float> %add2, %v2
+  ret <2 x float> %sub
+}
+
 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind