diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1757,16 +1757,17 @@
 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
 //    vgpr containing a value that is loaded outside of the loop. (Only on
 //    targets with no vscnt counter).
-// 2. The loop contains vmem load(s), but the loaded values are not used in the
-//    loop, and at least one use of a vgpr containing a value that is loaded
-//    outside of the loop.
+// 2. The loop contains vmem load(s), but at least one of the loaded values is
+//    not used in the loop, and at least one use of a vgpr containing a value
+//    that is loaded outside of the loop.
 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                         WaitcntBrackets &Brackets) {
   bool HasVMemLoad = false;
   bool HasVMemStore = false;
   bool UsesVgprLoadedOutside = false;
-  DenseSet<Register> VgprUse;
-  DenseSet<Register> VgprDef;
+  // Values that are defined within the loop but are not used within the same
+  // loop.
+  DenseSet<Register> LoopExtraneousDefs;
 
   for (MachineBasicBlock *MBB : ML->blocks()) {
     for (MachineInstr &MI : *MBB) {
@@ -1784,34 +1785,24 @@
         // Vgpr use
         if (Op.isUse()) {
           for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-            // If we find a register that is loaded inside the loop, 1. and 2.
-            // are invalidated and we can exit.
-            if (VgprDef.contains(RegNo))
-              return false;
-            VgprUse.insert(RegNo);
+            LoopExtraneousDefs.erase(RegNo);
             // If at least one of Op's registers is in the score brackets, the
             // value is likely loaded outside of the loop.
- if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) { + if (Brackets.getRegScore(RegNo, VM_CNT) > + Brackets.getScoreLB(VM_CNT)) UsesVgprLoadedOutside = true; - break; - } } } // VMem load vgpr def else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef()) - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprUse.contains(RegNo)) - return false; - VgprDef.insert(RegNo); - } + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) + LoopExtraneousDefs.insert(RegNo); } } } if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) return true; - return HasVMemLoad && UsesVgprLoadedOutside; + return LoopExtraneousDefs.size() && UsesVgprLoadedOutside; } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -482,7 +482,7 @@ $vgpr10 = COPY $vgpr0 $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) - $vgpr11 = COPY $vgpr7 + $vgpr11_vgpr12_vgpr13_vgpr14 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc S_CBRANCH_SCC1 %bb.1, implicit killed $scc S_BRANCH %bb.2 @@ -735,3 +735,86 @@ S_ENDPGM 0 ... + +# The loop contains a use of a value that is defined outside of the loop, and +# defines a value within the loop that is used outside of the loop. +# We expect the waitcnt to be hoisted. 
+
+# GFX9-LABEL: waitcnt_vm_loop_def_used_outside
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_def_used_outside
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_def_used_outside
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr11_vgpr12_vgpr13_vgpr14 = COPY $vgpr4_vgpr5_vgpr6_vgpr7
+    S_ENDPGM 0
+
+...
+
+# Same as before, except only part of the interval is used outside of the loop.
+# We expect the waitcnt to be hoisted.
+
+# GFX9-LABEL: waitcnt_vm_loop_def_used_outside_partial_use
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_def_used_outside_partial_use
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_def_used_outside_partial_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr11 = COPY $vgpr7
+    S_ENDPGM 0
+
+...