diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1757,22 +1757,24 @@ // 1. The loop contains vmem store(s), no vmem load and at least one use of a // vgpr containing a value that is loaded outside of the loop. (Only on // targets with no vscnt counter). -// 2. The loop contains vmem load(s), but the loaded values are not used in the -// loop, and at least one use of a vgpr containing a value that is loaded -// outside of the loop. +// 2. The loop contains vmem load(s), and at least one use of a vgpr containing +// a value that is loaded outside of the loop. bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets) { bool HasVMemLoad = false; bool HasVMemStore = false; bool UsesVgprLoadedOutside = false; - DenseSet VgprUse; - DenseSet VgprDef; for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { - if (MI.mayLoad()) + if (MI.mayLoad()) { + // Early exit if the loop contains a vmem load and uses a value loaded + // outside the loop. + if (UsesVgprLoadedOutside) + return true; HasVMemLoad = true; + } if (MI.mayStore()) HasVMemStore = true; } @@ -1784,34 +1786,24 @@ // Vgpr use if (Op.isUse()) { for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprDef.contains(RegNo)) - return false; - VgprUse.insert(RegNo); // If at least one of Op's registers is in the score brackets, the // value is likely loaded outside of the loop. - if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) { + if (Brackets.getRegScore(RegNo, VM_CNT) > + Brackets.getScoreLB(VM_CNT)) { + // Early exit if the loop contains a vmem load and uses a value + // loaded outside the loop. + if (HasVMemLoad) + return true; UsesVgprLoadedOutside = true; break; } } } - // VMem load vgpr def - else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef()) - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprUse.contains(RegNo)) - return false; - VgprDef.insert(RegNo); - } } } } - if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) - return true; - return HasVMemLoad && UsesVgprLoadedOutside; + return !ST->hasVscnt() && HasVMemStore && !HasVMemLoad && + UsesVgprLoadedOutside; } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1259,9 +1259,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1328,9 +1328,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1451,9 +1451,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1526,9 +1527,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1569,9 +1571,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1638,9 +1641,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1728,9 +1732,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -10,9 +10,10 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -33,9 +34,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 @@ -69,9 +71,10 @@ ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v4, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2 @@ -100,9 +103,10 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -317,9 +321,10 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -339,9 +344,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -369,9 +375,10 @@ ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v4, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -34,9 +34,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 @@ -62,9 +62,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll @@ -13,9 +13,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -46,9 +47,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -86,9 +88,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -124,9 +127,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -172,9 +176,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -209,9 +214,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -252,9 +258,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -292,9 +299,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -335,9 +343,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -366,9 +375,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -403,9 +413,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -439,9 +450,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -484,9 +496,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -519,9 +532,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -559,9 +573,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -597,9 +612,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -641,9 +657,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -674,9 +691,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -714,9 +732,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -752,9 +771,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -800,9 +820,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -837,9 +858,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -880,9 +902,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -920,9 +943,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -963,9 +987,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -994,9 +1019,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1031,9 +1057,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -1067,9 +1094,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -1112,9 +1140,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1147,9 +1176,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1187,9 +1217,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -1225,9 +1256,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -1269,9 +1301,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1302,9 +1335,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1342,9 +1376,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1380,9 +1415,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1428,9 +1464,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1465,9 +1502,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1508,9 +1546,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1548,9 +1587,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1591,9 +1631,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1622,9 +1663,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1659,9 +1701,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1695,9 +1738,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1740,9 +1784,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1775,9 +1820,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1815,9 +1861,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1853,9 +1900,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1897,9 +1945,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1930,9 +1979,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1970,9 +2020,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2008,9 +2059,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2056,9 +2108,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2093,9 +2146,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2136,9 +2190,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2176,9 +2231,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2219,9 +2275,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2250,9 +2307,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2287,9 +2345,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2323,9 +2382,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2368,9 +2428,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2403,9 +2464,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2443,9 +2505,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2481,9 +2544,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll @@ -15,10 +15,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -44,10 +45,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -71,10 +73,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -106,9 +109,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -139,9 +143,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -170,9 +175,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -214,10 +220,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -247,10 +254,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -278,10 +286,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -318,9 +327,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -355,9 +365,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -390,9 +401,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -429,10 +441,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -456,10 +469,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -483,10 +497,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -515,9 +530,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -546,9 +562,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -577,9 +594,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -618,10 +636,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -649,10 +668,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -680,10 +700,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -717,9 +738,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -752,9 +774,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -787,9 +810,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -827,10 +851,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,10 +881,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -883,10 +909,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -918,9 +945,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -951,9 +979,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -982,9 +1011,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1026,10 +1056,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1059,10 +1090,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1090,10 +1122,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1130,9 +1163,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1167,9 +1201,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1202,9 +1237,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -1241,10 +1277,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1268,10 +1305,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1295,10 +1333,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1327,9 +1366,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -1358,9 +1398,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -1389,9 +1430,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1430,10 +1472,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1461,10 +1504,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1492,10 +1536,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1529,9 +1574,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1564,9 +1610,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1599,9 +1646,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -1639,10 +1687,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1668,10 +1717,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1695,10 +1745,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1730,9 +1781,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1763,9 +1815,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1794,9 +1847,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1838,10 +1892,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1871,10 +1926,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1902,10 +1958,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1942,9 +1999,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1979,9 +2037,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2014,9 +2073,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2053,10 +2113,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2080,10 +2141,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2107,10 +2169,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2139,9 +2202,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -2170,9 +2234,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -2201,9 +2266,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -2242,10 +2308,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2273,10 +2340,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2304,10 +2372,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2341,9 +2410,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2376,9 +2446,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2411,9 +2482,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2451,10 +2523,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2480,10 +2553,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2507,10 +2581,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2542,9 +2617,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2575,9 +2651,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2606,9 +2683,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -2650,10 +2728,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2683,10 +2762,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2714,10 +2794,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2754,9 +2835,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2791,9 +2873,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2826,9 +2909,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2865,10 +2949,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2892,10 +2977,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2919,10 +3005,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2951,9 +3038,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -2982,9 +3070,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -3013,9 +3102,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -3054,10 +3144,11 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3085,10 +3176,11 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3116,10 +3208,11 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3153,9 +3246,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -3188,9 +3282,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -3223,9 +3318,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1347,9 +1347,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1416,9 +1416,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1539,9 +1539,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1614,9 +1615,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1657,9 +1659,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1726,9 +1729,10 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1818,9 +1822,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -62,7 +62,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -18,9 +18,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -42,9 +42,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -70,9 +70,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 @@ -106,9 +106,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -130,9 +130,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -158,9 +158,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 @@ -194,9 +194,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -216,9 +216,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -242,9 +242,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -274,9 +275,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -296,9 +297,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -322,9 +323,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -355,9 +357,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -383,9 +385,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -415,9 +417,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -455,9 +457,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -483,9 +485,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -515,9 +517,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -556,9 +558,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -581,9 +583,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -610,9 +612,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -645,9 +648,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -670,9 +673,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -699,9 +702,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -740,9 +744,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -764,9 +768,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -792,9 +796,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 @@ -828,9 +832,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -852,9 +856,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -880,9 +884,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 @@ -916,9 +920,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -938,9 +942,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -964,9 +968,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -996,9 +1001,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -1018,9 +1023,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1044,9 +1049,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1077,9 +1083,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1105,9 +1111,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1137,9 +1143,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1177,9 +1183,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1205,9 +1211,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1237,9 +1243,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1278,9 +1284,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1303,9 +1309,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1332,9 +1338,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1367,9 +1374,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1392,9 +1399,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1421,9 +1428,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1462,9 +1470,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1486,9 +1494,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1514,9 +1522,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 @@ -1550,9 +1558,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1574,9 +1582,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1602,9 +1610,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 @@ -1638,9 +1646,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -1660,9 +1668,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1686,9 +1694,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1718,9 +1727,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -1740,9 +1749,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1766,9 +1775,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1799,9 +1809,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1827,9 +1837,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1859,9 +1869,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1899,9 +1909,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1927,9 +1937,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1959,9 +1969,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2000,9 +2010,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2025,9 +2035,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2054,9 +2064,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2089,9 +2100,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2114,9 +2125,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2143,9 +2154,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2184,9 +2196,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2208,9 +2220,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2236,9 +2248,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 @@ -2272,9 +2284,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2296,9 +2308,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2324,9 +2336,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 @@ -2360,9 +2372,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -2382,9 +2394,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2408,9 +2420,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2440,9 +2453,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -2462,9 +2475,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2488,9 +2501,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2521,9 +2535,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2549,9 +2563,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2581,9 +2595,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2621,9 +2635,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2649,9 +2663,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2681,9 +2695,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2722,9 +2736,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2747,9 +2761,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2776,9 +2790,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2811,9 +2826,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2836,9 +2851,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2865,9 +2880,10 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -31,9 +31,10 @@ ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_max_i32_e32 v3, s2, v4 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v6, v4 @@ -91,9 +92,10 @@ ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_max_i32_e32 v3, s2, v4 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v6, v4 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -114,18 +114,18 @@ # The loop contains a store, a load, and uses values loaded both inside and # outside the loop. -# We do not expect the waitcnt to be hoisted out of the loop. +# We expect the waitcnt to be hoisted out of the loop. # GFX9-LABEL: waitcnt_vm_loop_load # GFX9-LABEL: bb.0: -# GFX9-NOT: S_WAITCNT 39 +# GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.1: # GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.2: # GFX10-LABEL: waitcnt_vm_loop_load # GFX10-LABEL: bb.0: -# GFX10-NOT: S_WAITCNT 16 +# GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: @@ -319,18 +319,18 @@ --- # Same as loop2 but the value loaded inside the loop is also used in the loop. -# We do not expect the waitcnt to be hoisted out of the loop. +# We expect the waitcnt to be hoisted out of the loop. # GFX9-LABEL: waitcnt_vm_loop2_use_in_loop # GFX9-LABEL: bb.0: -# GFX9-NOT: S_WAITCNT 39 +# GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.1: # GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.2: # GFX10-LABEL: waitcnt_vm_loop2_use_in_loop # GFX10-LABEL: bb.0: -# GFX10-NOT: S_WAITCNT 16 +# GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: @@ -456,14 +456,14 @@ # GFX9-LABEL: waitcnt_vm_loop2_reginterval2 # GFX9-LABEL: bb.0: -# GFX9-NOT: S_WAITCNT 39 +# GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.1: # GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.2: # GFX10-LABEL: waitcnt_vm_loop2_reginterval2 # GFX10-LABEL: bb.0: -# GFX10-NOT: S_WAITCNT 16 +# GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: @@ -698,18 +698,18 @@ # The loop contains a store, a load, and uses values loaded both inside and # outside the loop. -# We do not expect the waitcnt to be hoisted out of the loop. +# We expect the waitcnt to be hoisted out of the loop. # GFX9-LABEL: waitcnt_vm_loop_flat_load # GFX9-LABEL: bb.0: -# GFX9-NOT: S_WAITCNT 39 +# GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.1: # GFX9: S_WAITCNT 39 # GFX9-LABEL: bb.2: # GFX10-LABEL: waitcnt_vm_loop_flat_load # GFX10-LABEL: bb.0: -# GFX10-NOT: S_WAITCNT 16 +# GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: