diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -141,8 +141,13 @@ VMEM_BVH }; +static bool updateVMCntOnly(const MachineInstr &Inst) { + return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) || + SIInstrInfo::isFLATScratch(Inst); +} + VmemType getVmemType(const MachineInstr &Inst) { - assert(SIInstrInfo::isVMEM(Inst)); + assert(updateVMCntOnly(Inst)); if (!SIInstrInfo::isMIMG(Inst)) return VMEM_NOSAMPLER; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); @@ -683,7 +688,7 @@ if (T == VM_CNT) { if (Interval.first >= NUM_ALL_VGPRS) continue; - if (SIInstrInfo::isVMEM(Inst)) { + if (updateVMCntOnly(Inst)) { VmemType V = getVmemType(Inst); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) VgprVmemTypes[RegNo] |= 1 << V; @@ -1182,7 +1187,7 @@ // previous write and this write are the same type of VMEM // instruction, in which case they're guaranteed to write their // results in order anyway. - if (Op.isUse() || !SIInstrInfo::isVMEM(MI) || + if (Op.isUse() || !updateVMCntOnly(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, getVmemType(MI))) { ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -476,9 +476,7 @@ ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc -; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc -; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V3-NEXT: s_waitcnt vmcnt(0) @@ -495,9 +493,7 @@ ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 ; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc -; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc -; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -515,9 +511,7 @@ ; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 ; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc -; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: v_mov_b32_e32 v0, s8 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -534,7 +534,6 @@ ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB3_3: ; %T -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc @@ -706,7 +705,6 @@ ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB4_3: ; %T -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc @@ -878,7 +876,6 @@ ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB5_3: ; %T -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -462,9 +462,7 @@ ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc -; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc -; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V3-NEXT: s_waitcnt vmcnt(0) @@ -481,9 +479,7 @@ ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 ; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc -; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc -; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -500,9 +496,7 @@ ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc -; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc -; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc ; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1667,8 +1667,8 @@ ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v3 @@ -1771,7 +1771,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 @@ -1786,7 +1785,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 @@ -1800,7 +1798,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir @@ -72,3 +72,56 @@ $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s128)) ... +# (global_load + scratch_load + buffer_load) +--- +name: global_scratch_buffer +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1 + ; GFX9-LABEL: name: global_scratch_buffer + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec +... +# waw between flat and buffer should have a wait inserted between. +# (flat + buffer) +--- +name: flat_buffer +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1 + ; GFX9-LABEL: name: flat_buffer + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: S_WAITCNT 49279 + ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec +... +# buffer + flat +--- +name: buffer_flat +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1 + ; GFX9-LABEL: name: buffer_flat + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec + ; GFX9-NEXT: S_WAITCNT 3952 + ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr +...