diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1721,7 +1721,8 @@ for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { - if (SIInstrInfo::isVMEM(MI)) { + if (SIInstrInfo::isVMEM(MI) || + (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI))) { if (MI.mayLoad()) HasVMemLoad = true; if (MI.mayStore()) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -755,9 +755,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -824,9 +824,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -947,9 +947,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1022,9 +1023,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1065,9 +1067,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1134,9 +1137,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1224,9 +1228,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -10,9 +10,10 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -33,9 +34,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 @@ -69,9 +71,10 @@ ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v4, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2 @@ -100,9 +103,10 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -317,9 +321,10 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -339,9 +344,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -369,9 +375,10 @@ ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v4, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -34,9 +34,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 @@ -62,9 +62,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll @@ -13,9 +13,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -46,9 +47,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -86,9 +88,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -124,9 +127,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -172,9 +176,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -209,9 +214,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -252,9 +258,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -292,9 +299,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -335,9 +343,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -366,9 +375,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -403,9 +413,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -439,9 +450,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -484,9 +496,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -519,9 +532,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -559,9 +573,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -597,9 +612,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -641,9 +657,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -674,9 +691,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -714,9 +732,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -752,9 +771,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -800,9 +820,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -837,9 +858,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -880,9 +902,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -920,9 +943,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -963,9 +987,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -994,9 +1019,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1031,9 +1057,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -1067,9 +1094,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -1112,9 +1140,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1147,9 +1176,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1187,9 +1217,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -1225,9 +1256,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -1269,9 +1301,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1302,9 +1335,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1342,9 +1376,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1380,9 +1415,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1428,9 +1464,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1465,9 +1502,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1508,9 +1546,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1548,9 +1587,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1591,9 +1631,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1622,9 +1663,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1659,9 +1701,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1695,9 +1738,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1740,9 +1784,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1775,9 +1820,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1815,9 +1861,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1853,9 +1900,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1897,9 +1945,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1930,9 +1979,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1970,9 +2020,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2008,9 +2059,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2056,9 +2108,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2093,9 +2146,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2136,9 +2190,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2176,9 +2231,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2219,9 +2275,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2250,9 +2307,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2287,9 +2345,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2323,9 +2382,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2368,9 +2428,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2403,9 +2464,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2443,9 +2505,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2481,9 +2544,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll @@ -15,10 +15,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -44,10 +45,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -71,10 +73,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -106,9 +109,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -139,9 +143,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -170,9 +175,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -214,10 +220,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -247,10 +254,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -278,10 +286,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -318,9 +327,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -355,9 +365,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -390,9 +401,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -429,10 +441,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -456,10 +469,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -483,10 +497,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -515,9 +530,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -546,9 +562,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -577,9 +594,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -618,10 +636,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -649,10 +668,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -680,10 +700,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -717,9 +738,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -752,9 +774,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -787,9 +810,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -827,10 +851,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,10 +881,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -883,10 +909,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -918,9 +945,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -951,9 +979,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -982,9 +1011,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1026,10 +1056,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1059,10 +1090,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1090,10 +1122,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1130,9 +1163,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1167,9 +1201,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1202,9 +1237,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -1241,10 +1277,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1268,10 +1305,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1295,10 +1333,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1327,9 +1366,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -1358,9 +1398,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -1389,9 +1430,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1430,10 +1472,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1461,10 +1504,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1492,10 +1536,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1529,9 +1574,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1564,9 +1610,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1599,9 +1646,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -1639,10 +1687,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1668,10 +1717,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1695,10 +1745,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1730,9 +1781,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1763,9 +1815,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1794,9 +1847,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1838,10 +1892,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1871,10 +1926,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1902,10 +1958,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1942,9 +1999,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1979,9 +2037,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2014,9 +2073,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2053,10 +2113,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2080,10 +2141,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2107,10 +2169,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2139,9 +2202,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -2170,9 +2234,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -2201,9 +2266,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -2242,10 +2308,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2273,10 +2340,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2304,10 +2372,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2341,9 +2410,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2376,9 +2446,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2411,9 +2482,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2451,10 +2523,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2480,10 +2553,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2507,10 +2581,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2542,9 +2617,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2575,9 +2651,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2606,9 +2683,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -2650,10 +2728,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2683,10 +2762,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2714,10 +2794,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2754,9 +2835,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2791,9 +2873,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2826,9 +2909,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2865,10 +2949,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2892,10 +2977,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2919,10 +3005,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2951,9 +3038,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -2982,9 +3070,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -3013,9 +3102,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -3054,10 +3144,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3085,10 +3176,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3116,10 +3208,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3153,9 +3246,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -3188,9 +3282,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -3223,9 +3318,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -843,9 +843,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -912,9 +912,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1035,9 +1035,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1110,9 +1111,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1153,9 +1155,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1222,9 +1225,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1314,9 +1318,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -62,7 +62,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -18,9 +18,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -42,9 +42,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -70,9 +70,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 @@ -106,9 +106,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -130,9 +130,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -158,9 +158,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 @@ -194,9 +194,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -216,9 +216,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -242,9 +242,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -274,9 +275,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -296,9 +297,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -322,9 +323,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -355,9 +357,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -383,9 +385,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -415,9 +417,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -455,9 +457,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -483,9 +485,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -515,9 +517,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -556,9 +558,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -581,9 +583,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -610,9 +612,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -645,9 +648,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -670,9 +673,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -699,9 +702,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -740,9 +744,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -764,9 +768,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -792,9 +796,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 @@ -828,9 +832,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -852,9 +856,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -880,9 +884,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 @@ -916,9 +920,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -938,9 +942,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -964,9 +968,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -996,9 +1001,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -1018,9 +1023,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1044,9 +1049,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1077,9 +1083,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1105,9 +1111,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1137,9 +1143,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1177,9 +1183,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1205,9 +1211,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1237,9 +1243,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1278,9 +1284,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1303,9 +1309,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1332,9 +1338,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1367,9 +1374,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1392,9 +1399,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1421,9 +1428,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1462,9 +1470,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1486,9 +1494,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1514,9 +1522,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 @@ -1550,9 +1558,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1574,9 +1582,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1602,9 +1610,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 @@ -1638,9 +1646,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -1660,9 +1668,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1686,9 +1694,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1718,9 +1727,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -1740,9 +1749,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1766,9 +1775,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1799,9 +1809,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1827,9 +1837,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1859,9 +1869,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1899,9 +1909,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1927,9 +1937,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1959,9 +1969,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2000,9 +2010,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2025,9 +2035,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2054,9 +2064,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2089,9 +2100,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2114,9 +2125,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2143,9 +2154,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2184,9 +2196,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2208,9 +2220,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2236,9 +2248,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 @@ -2272,9 +2284,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2296,9 +2308,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2324,9 +2336,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 @@ -2360,9 +2372,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -2382,9 +2394,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2408,9 +2420,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2440,9 +2453,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -2462,9 +2475,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2488,9 +2501,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2521,9 +2535,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2549,9 +2563,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2581,9 +2595,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2621,9 +2635,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2649,9 +2663,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2681,9 +2695,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2722,9 +2736,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2747,9 +2761,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2776,9 +2790,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2811,9 +2826,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2836,9 +2851,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2865,9 +2880,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -572,3 +572,125 @@ S_ENDPGM 0 ... +--- + +# The loop contains a global store, and uses a (global) loaded value outside of the loop. + +# GFX9-LABEL: waitcnt_vm_loop_global_mem +# GFX9-LABEL: bb.0: +# GFX9: S_WAITCNT 39 +# GFX9-LABEL: bb.1: +# GFX9-NOT: S_WAITCNT 39 +# GFX9-LABEL: bb.2: + +# GFX10-LABEL: waitcnt_vm_loop_global_mem +# GFX10-LABEL: bb.0: +# GFX10-NOT: S_WAITCNT 16 +# GFX10-LABEL: bb.1: +# GFX10: S_WAITCNT 16 +# GFX10-LABEL: bb.2: + +name: waitcnt_vm_loop_global_mem +body: | + bb.0: + successors: %bb.1 + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec + S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + successors: %bb.3 + S_BRANCH %bb.3 + + bb.3: + S_ENDPGM 0 + +... +--- + +# Same as above case, but use scratch memory instructions instead + +# GFX9-LABEL: waitcnt_vm_loop_scratch_mem +# GFX9-LABEL: bb.0: +# GFX9: S_WAITCNT 39 +# GFX9-LABEL: bb.1: +# GFX9-NOT: S_WAITCNT 39 +# GFX9-LABEL: bb.2: + +# GFX10-LABEL: waitcnt_vm_loop_scratch_mem +# GFX10-LABEL: bb.0: +# GFX10-NOT: S_WAITCNT 16 +# GFX10-LABEL: bb.1: +# GFX10: S_WAITCNT 16 +# GFX10-LABEL: bb.2: + +name: waitcnt_vm_loop_scratch_mem +body: | + bb.0: + successors: %bb.1 + $vgpr0 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + SCRATCH_STORE_DWORD $vgpr4, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr + S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + successors: %bb.3 + S_BRANCH %bb.3 + + bb.3: + S_ENDPGM 0 + +... +--- + +# Same as above case, but use flat memory instructions instead + +# GFX9-LABEL: waitcnt_vm_loop_flat_mem +# GFX9-LABEL: bb.0: +# GFX9: S_WAITCNT 39 +# GFX9-LABEL: bb.1: +# GFX9-NOT: S_WAITCNT 39 +# GFX9-LABEL: bb.2: + +# GFX10-LABEL: waitcnt_vm_loop_flat_mem +# GFX10-LABEL: bb.0: +# GFX10-NOT: S_WAITCNT 11 +# GFX10-LABEL: bb.1: +# GFX10: S_WAITCNT 11 +# GFX10-LABEL: bb.2: +name: waitcnt_vm_loop_flat_mem +body: | + bb.0: + successors: %bb.1 + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec + FLAT_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr + S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + successors: %bb.3 + S_BRANCH %bb.3 + + bb.3: + S_ENDPGM 0 + +...