diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -26,11 +26,18 @@ #define DEBUG_TYPE "amdgpu-set-wave-priority" +static cl::opt<unsigned> DefaultVALUInstsThreshold( + "amdgpu-set-wave-priority-valu-insts-threshold", + cl::desc("VALU instruction count threshold for adjusting wave priority"), + cl::init(100), cl::Hidden); + namespace { struct MBBInfo { MBBInfo() = default; + unsigned NumVALUInstsAtStart = 0; bool MayReachVMEMLoad = false; + MachineInstr *LastVMEMLoad = nullptr; }; using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>; @@ -46,7 +53,9 @@ bool runOnMachineFunction(MachineFunction &MF) override; private: - MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const; + MachineInstr *BuildSetprioMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned priority) const; const SIInstrInfo *TII; }; @@ -62,9 +71,12 @@ return new AMDGPUSetWavePriority(); } -MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF, - unsigned priority) const { - return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority); +MachineInstr * +AMDGPUSetWavePriority::BuildSetprioMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned priority) const { + return BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)) + .addImm(priority); } // Checks that for every predecessor Pred that can reach a VMEM load, @@ -97,21 +109,58 @@ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); + unsigned VALUInstsThreshold = DefaultVALUInstsThreshold; + Attribute A = F.getFnAttribute("amdgpu-wave-priority-threshold"); + if (A.isValid()) + A.getValueAsString().getAsInteger(0, VALUInstsThreshold); + + // Find VMEM loads that may be executed before long-enough sequences of + // VALU instructions. 
We currently assume that backedges/loops, branch + // probabilities and other details can be ignored, so we essentially + // determine the largest number of VALU instructions along every + // possible path from the start of the function that may potentially be + // executed provided no backedge is ever taken. MBBInfoSet MBBInfos; - SmallVector<const MachineBasicBlock *, 16> Worklist; - for (MachineBasicBlock &MBB : MF) { - if (any_of(MBB, isVMEMLoad)) - Worklist.push_back(&MBB); - } - - // Mark blocks from which control may reach VMEM loads. - while (!Worklist.empty()) { - const MachineBasicBlock *MBB = Worklist.pop_back_val(); + for (MachineBasicBlock *MBB : post_order(&MF)) { MBBInfo &Info = MBBInfos[MBB]; - if (!Info.MayReachVMEMLoad) { - Info.MayReachVMEMLoad = true; - Worklist.append(MBB->pred_begin(), MBB->pred_end()); + bool AtStart = true; + unsigned MaxNumVALUInstsInMiddle = 0; + unsigned NumVALUInstsAtEnd = 0; + for (MachineInstr &MI : *MBB) { + if (isVMEMLoad(MI)) { + AtStart = false; + Info.NumVALUInstsAtStart = 0; + MaxNumVALUInstsInMiddle = 0; + NumVALUInstsAtEnd = 0; + Info.LastVMEMLoad = &MI; + } else if (SIInstrInfo::isDS(MI)) { + AtStart = false; + MaxNumVALUInstsInMiddle = + std::max(MaxNumVALUInstsInMiddle, NumVALUInstsAtEnd); + NumVALUInstsAtEnd = 0; + } else if (SIInstrInfo::isVALU(MI)) { + if (AtStart) + ++Info.NumVALUInstsAtStart; + ++NumVALUInstsAtEnd; + } } + + bool SuccsMayReachVMEMLoad = false; + unsigned NumFollowingVALUInsts = 0; + for (const MachineBasicBlock *Succ : MBB->successors()) { + SuccsMayReachVMEMLoad |= MBBInfos[Succ].MayReachVMEMLoad; + NumFollowingVALUInsts = + std::max(NumFollowingVALUInsts, MBBInfos[Succ].NumVALUInstsAtStart); + } + if (AtStart) + Info.NumVALUInstsAtStart += NumFollowingVALUInsts; + NumVALUInstsAtEnd += NumFollowingVALUInsts; + + unsigned MaxNumVALUInsts = + std::max(MaxNumVALUInstsInMiddle, NumVALUInstsAtEnd); + Info.MayReachVMEMLoad = + SuccsMayReachVMEMLoad || + (Info.LastVMEMLoad && MaxNumVALUInsts >= VALUInstsThreshold); } 
MachineBasicBlock &Entry = MF.front(); @@ -122,10 +171,10 @@ MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end(); while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator()) ++I; - Entry.insert(I, BuildSetprioMI(MF, HighPriority)); + BuildSetprioMI(Entry, I, HighPriority); // Lower the priority on edges where control leaves blocks from which - // VMEM loads are reachable. + // the VMEM loads are reachable. SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; for (MachineBasicBlock &MBB : MF) { if (MBBInfos[&MBB].MayReachVMEMLoad) { @@ -152,14 +201,12 @@ } for (MachineBasicBlock *MBB : PriorityLoweringBlocks) { - MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin(); - while (I != B) { - if (isVMEMLoad(*--I)) { - ++I; - break; - } - } - MBB->insert(I, BuildSetprioMI(MF, LowPriority)); + BuildSetprioMI( + *MBB, + MBBInfos[MBB].LastVMEMLoad + ? std::next(MachineBasicBlock::iterator(MBBInfos[MBB].LastVMEMLoad)) + : MBB->begin(), + LowPriority); } return true; diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll --- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll @@ -4,8 +4,9 @@ ; CHECK-LABEL: no_setprio: ; CHECK-NOT: s_setprio ; CHECK: ; return to shader part epilog -define amdgpu_ps <2 x float> @no_setprio() { - ret <2 x float> <float 0.0, float 0.0> +define amdgpu_ps <2 x float> @no_setprio(<2 x float> %a, <2 x float> %b) "amdgpu-wave-priority-threshold"="1" { + %s = fadd <2 x float> %a, %b + ret <2 x float> %s } ; CHECK-LABEL: vmem_in_exit_block: @@ -13,9 +14,10 @@ ; CHECK: buffer_load_dwordx2 ; CHECK-NEXT: s_setprio 0 ; CHECK: ; return to shader part epilog -define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) { +define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) - ret <2 x float> %v + %s = 
fadd <2 x float> %v, %x + ret <2 x float> %s } ; CHECK-LABEL: branch: @@ -29,7 +31,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[EXIT]] ; CHECK-NEXT: [[EXIT]]: -define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) { +define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" { %cond = icmp eq i32 %i, 0 br i1 %cond, label %a, label %b @@ -38,7 +40,8 @@ b: %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) - ret <2 x float> %v + %s = fadd <2 x float> %v, %x + ret <2 x float> %s } ; CHECK-LABEL: setprio_follows_setprio: @@ -48,7 +51,7 @@ ; CHECK: {{.*}}: ; %a ; CHECK: buffer_load_dwordx2 ; CHECK-NEXT: s_setprio 0 -; CHECK: s_cbranch_scc1 [[C]] +; CHECK: s_cbranch_vccnz [[C]] ; CHECK: {{.*}}: ; %b ; CHECK-NOT: s_setprio ; CHECK: s_branch [[EXIT:.*]] @@ -56,7 +59,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[EXIT]] ; CHECK: [[EXIT]]: -define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) { +define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="3" { entry: %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) %cond1 = icmp ne i32 %i, 0 @@ -64,15 +67,16 @@ a: %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0) - %cond2 = icmp ne i32 %i, 1 + %v20 = extractelement <2 x float> %v2, i32 0 + %v21 = extractelement <2 x float> %v2, i32 1 + %cond2 = fcmp ult float %v20, %v21 br i1 %cond2, label %b, label %c b: ret <2 x float> %v2 c: - %v3 = phi <2 x float> [%v1, %entry], [%v2, %a] - %v4 = fadd <2 x float> %v1, %v3 + %v4 = fadd <2 x float> %v1, %v1 ret <2 x float> %v4 } @@ -87,7 +91,7 @@ ; CHECK: s_cbranch_scc1 [[LOOP]] ; CHECK-NEXT: {{.*}}: ; %exit ; CHECK-NEXT: s_setprio 0 -define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) { +define 
amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) "amdgpu-wave-priority-threshold"="2" { entry: br label %loop @@ -125,7 +129,7 @@ ; CHECK-NEXT: s_setprio 0 ; CHECK: s_branch [[RET]] ; CHECK: [[RET]]: -define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) { +define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) "amdgpu-wave-priority-threshold"="2" { entry: %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) %cond = icmp ne i32 %x, 0 @@ -150,4 +154,53 @@ ret <2 x float> %sum } +; CHECK-LABEL: valu_insts_threshold: +; CHECK: s_setprio 3 +; CHECK: buffer_load_dwordx2 +; CHECK-NEXT: s_setprio 0 +; CHECK-COUNT-4: v_add_f32_e32 +; CHECK: s_cbranch_scc0 [[A:.*]] +; CHECK: {{.*}}: ; %b +; CHECK-NEXT: buffer_load_dwordx2 +; CHECK: s_branch [[END:.*]] +; CHECK: [[A]]: ; %a +; CHECK: s_branch [[END]] +; CHECK: [[END]]: +define amdgpu_ps <2 x float> @valu_insts_threshold(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="4" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %add = fadd <2 x float> %v, %v + %add2 = fadd <2 x float> %add, %add + + %cond = icmp eq i32 %i, 0 + br i1 %cond, label %a, label %b + +a: + ret <2 x float> %add2 + +b: + %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0) + %sub = fsub <2 x float> %add2, %v2 + ret <2 x float> %sub +} + +; CHECK-LABEL: valu_insts_threshold2: +; CHECK-NOT: s_setprio +; CHECK: ; -- End function +define amdgpu_ps <2 x float> @valu_insts_threshold2(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="5" { + %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0) + %add = fadd <2 x float> %v, %v + %add2 = fadd <2 x float> %add, %add + + %cond = icmp eq i32 %i, 0 + br i1 %cond, label %a, label %b + +a: + ret <2 x float> %add2 + +b: + %v2 = call <2 x float> 
@llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0) + %sub = fsub <2 x float> %add2, %v2 + ret <2 x float> %sub +} + declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind