diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1726,34 +1726,30 @@
       }
       for (unsigned I = 0; I < MI.getNumOperands(); I++) {
         MachineOperand &Op = MI.getOperand(I);
-        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()) || (!Op.readsReg() && !Op.isDef()))
           continue;
         RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
-        // Vgpr use
-        if (Op.isUse()) {
-          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-            // If we find a register that is loaded inside the loop, 1. and 2.
-            // are invalidated and we can exit.
-            if (VgprDef.contains(RegNo))
-              return false;
-            VgprUse.insert(RegNo);
-            // If at least one of Op's registers is in the score brackets, the
-            // value is likely loaded outside of the loop.
-            if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
-              UsesVgprLoadedOutside = true;
-              break;
-            }
-          }
+        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (Op.isUse()) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprDef.contains(RegNo))
+              return false;
+            VgprUse.insert(RegNo);
+            // If at least one of Op's registers is in the score brackets, the
+            // value is likely loaded outside of the loop.
+            if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
+              UsesVgprLoadedOutside = true;
+              break;
+            }
+          } else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef()) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprUse.contains(RegNo))
+              return false;
+            VgprDef.insert(RegNo);
+          }
         }
-        // VMem load vgpr def
-        else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
-          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-            // If we find a register that is loaded inside the loop, 1. and 2.
-            // are invalidated and we can exit.
-            if (VgprUse.contains(RegNo))
-              return false;
-            VgprDef.insert(RegNo);
-          }
       }
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -278,6 +278,40 @@
 
 ...
 ---
+# Same as before but with undef uses. No flush should be generated.
+
+# GFX9-LABEL: waitcnt_vm_loop2_undef
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+
+# GFX10-LABEL: waitcnt_vm_loop2_undef
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+
+name: waitcnt_vm_loop2_undef
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 undef $vgpr0, $vgpr2, implicit $exec
+    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
 
 # Same as before with an additional store in the loop. We still expect the
 # waitcnt instructions to be hoisted.
@@ -535,3 +569,40 @@
     S_ENDPGM 0
 
 ...
+---
+
+# This test case checks that we flush the vmcnt counter only if necessary
+# (i.e. if a waitcnt is needed for the vgpr use we find in the loop)
+
+# GFX10-LABEL: waitcnt_vm_necessary
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10: renamable $vgpr4
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+
+# GFX9-LABEL: waitcnt_vm_necessary
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9: renamable $vgpr4
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+
+name: waitcnt_vm_necessary
+body: |
+  bb.0:
+    successors: %bb.1(0x80000000)
+
+    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    renamable $vgpr4 = BUFFER_LOAD_DWORD_OFFEN renamable $vgpr0, undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1(0x40000000)
+
+    renamable $vgpr5 = BUFFER_LOAD_DWORD_OFFEN renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_ENDPGM 0
+
+...