diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1739,7 +1739,10 @@
             VgprUse.insert(RegNo);
             // If at least one of Op's registers is in the score brackets, the
             // value is likely loaded outside of the loop.
-            if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
+            unsigned Score = Brackets.getRegScore(RegNo, VM_CNT);
+            AMDGPU::Waitcnt Wait;
+            Brackets.determineWait(VM_CNT, Score, Wait);
+            if (Wait.hasWaitVmCnt()) {
               UsesVgprLoadedOutside = true;
               break;
             }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -569,6 +569,10 @@
     return VsCnt != ~0u;
   }
 
+  bool hasWaitVmCnt() const {
+    return VmCnt != ~0u;
+  }
+
   bool dominates(const Waitcnt &Other) const {
     return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
            LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -535,3 +535,40 @@
     S_ENDPGM 0
 
 ...
+---
+
+# Checks that the vmcnt counter is flushed only if necessary, i.e. only if the vgpr used in the
+# loop ($vgpr0 in bb.1) still needs a waitcnt: no flush is expected after the $vgpr4 load in bb.0.
+
+# GFX10-LABEL: waitcnt_vm_necessary
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10: renamable $vgpr4
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+
+# GFX9-LABEL: waitcnt_vm_necessary
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9: renamable $vgpr4
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+
+name: waitcnt_vm_necessary
+body: |
+  bb.0:
+    successors: %bb.1(0x80000000)
+
+    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    renamable $vgpr4 = BUFFER_LOAD_DWORD_OFFEN undef renamable $vgpr0, undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1(0x40000000)
+
+    renamable $vgpr5 = BUFFER_LOAD_DWORD_OFFEN undef renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_ENDPGM 0
+
+...