diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -980,11 +980,17 @@ // 2) If a destination operand that was used by a recent export/store ins, // add s_waitcnt on exp_cnt to guarantee the WAR order. for (const MachineMemOperand *Memop : MI.memoperands()) { - const Value *Ptr = Memop->getValue(); - if (Memop->isStore() && SLoadAddresses.count(Ptr)) { - addWait(Wait, LGKM_CNT, 0); - if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) - SLoadAddresses.erase(Ptr); + if (Memop->isStore()) { + const Value *Ptr = Memop->getValue(); + if (!Ptr) { + // Conservatively add the s_waitcnt in the absence of memop data. + addWait(Wait, LGKM_CNT, 0); + } else if (SLoadAddresses.count(Ptr)) { + addWait(Wait, LGKM_CNT, 0); + if (PDT->dominates(MI.getParent(), + SLoadAddresses.find(Ptr)->second)) + SLoadAddresses.erase(Ptr); + } } unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUAS::LOCAL_ADDRESS) @@ -1480,7 +1486,8 @@ if (TII->isSMRD(Inst)) { for (const MachineMemOperand *Memop : Inst.memoperands()) { const Value *Ptr = Memop->getValue(); - SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); + if (Ptr) + SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); } if (ST->hasReadVCCZBug()) { // This smem read could complete and clobber vccz at any time. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -37,6 +37,7 @@ ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 ; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:260 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:264 @@ -117,7 +118,6 @@ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:440 ; GCN-NEXT: v_mov_b32_e32 v0, s27 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:444 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:448 ; GCN-NEXT: v_mov_b32_e32 v0, s37 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -5732,8 +5732,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, s22 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -5823,13 +5823,13 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 ; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -46,9 +46,8 @@ ; CHECK-NEXT: ds_read_b32 v7, v10 ; CHECK-NEXT: ds_read_b32 v6, v1 ; CHECK-NEXT: ds_read_b32 v5, v5 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc ; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc ; CHECK-NEXT: s_endpgm %load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4 @@ -84,9 +83,8 @@ ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc ; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc ; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: ds_read_b32 v0, v2 @@ -97,9 +95,8 @@ ; CHECK-NEXT: ds_read_b32 v4, v10 ; CHECK-NEXT: s_waitcnt lgkmcnt(5) ; CHECK-NEXT: exp mrt0 off, off, off, off -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc ; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc ; CHECK-NEXT: s_endpgm %load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -763,6 +763,7 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 @@ -771,7 +772,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 BB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 @@ -998,6 +998,7 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 @@ -1006,7 +1007,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 BB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir @@ -12,6 +12,6 @@ liveins: $sgpr0_sgpr1 $sgpr4 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) S_WAITCNT_VSCNT undef $sgpr_null, 0 - $vgpr0 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1) + $vgpr0 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst 4 on `i32 addrspace(1)* undef`) S_CMP_LG_U32 killed $sgpr4, 0, implicit-def $scc ...