# Patch summary (descriptive header only; everything from the first "diff --git" onward is unchanged).
#
# Files touched:
#   1. llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
#   2. llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
#
# Hunk 1 (SIInsertWaitcnts.cpp, @@ -1737,7 +1737,7 @@):
#   The condition that sets UsesVgprLoadedOutside changes from
#       Brackets.getRegScore(RegNo, VM_CNT) > 0
#   to
#       Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)
#   so the register's VM_CNT score is compared against the value returned by
#   getScoreLB(VM_CNT) instead of against zero.
#   NOTE(review): presumably getScoreLB is the lower bound of still-pending
#   scores, so a score at or below it no longer needs a wait — confirm against
#   the WaitcntBrackets definition, which is not part of this patch.
#
# Hunk 2 (waitcnt-vmcnt-loop.mir, @@ -535,3 +535,40 @@):
#   Appends the test function waitcnt_vm_necessary. For both the GFX10 and GFX9
#   prefixes the checks expect, in bb.0, an S_WAITCNT (16240 for GFX10, 3952 for
#   GFX9) followed by a line mentioning $vgpr4 and no further S_WAITCNT, and
#   then no S_WAITCNT at all after the bb.1 label.
#   NOTE(review): bb.1 lists only %bb.1 as a successor while S_ENDPGM 0 follows
#   the conditional branch in the same block — confirm this matches the layout
#   used by the other tests in that file.
#
# NOTE(review): in this copy the whole patch sits on one physical line, so the
# original line breaks and context indentation appear to have been lost; they
# must be restored before the patch can be applied.
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1737,7 +1737,7 @@ VgprUse.insert(RegNo); // If at least one of Op's registers is in the score brackets, the // value is likely loaded outside of the loop. - if (Brackets.getRegScore(RegNo, VM_CNT) > 0) { + if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) { UsesVgprLoadedOutside = true; break; } diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -535,3 +535,40 @@ S_ENDPGM 0 ... +--- + +# This test case checks that we flush the vmcnt counter only if necessary +# (i.e. if a waitcnt is needed for the vgpr use we find in the loop) + +# GFX10-LABEL: waitcnt_vm_necessary +# GFX10-LABEL: bb.0: +# GFX10: S_WAITCNT 16240 +# GFX10: $vgpr4 +# GFX10-NOT: S_WAITCNT +# GFX10-LABEL: bb.1: +# GFX10-NOT: S_WAITCNT + +# GFX9-LABEL: waitcnt_vm_necessary +# GFX9-LABEL: bb.0: +# GFX9: S_WAITCNT 3952 +# GFX9: $vgpr4 +# GFX9-NOT: S_WAITCNT +# GFX9-LABEL: bb.1: +# GFX9-NOT: S_WAITCNT + +name: waitcnt_vm_necessary +body: | + bb.0: + successors: %bb.1(0x80000000) + + $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec + + bb.1: + successors: %bb.1(0x40000000) + + $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + S_ENDPGM 0 + +...