diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -398,6 +398,7 @@ bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, WaitcntBrackets &ScoreBrackets); + bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -1703,6 +1704,11 @@ return UpdateCache(false); } +bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { + return SIInstrInfo::isVMEM(MI) || + (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)); +} + // Return true if it is better to flush the vmcnt counter in the preheader of // the given loop. We currently decide to flush in two situations: // 1. The loop contains vmem store(s), no vmem load and at least one use of a @@ -1721,8 +1727,7 @@ for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { - if (SIInstrInfo::isVMEM(MI) || - (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI))) { + if (isVMEMOrFlatVMEM(MI)) { if (MI.mayLoad()) HasVMemLoad = true; if (MI.mayStore()) @@ -1750,7 +1755,7 @@ } } // VMem load vgpr def - else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef()) + else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef()) for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -755,9 +755,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -824,9 +824,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -947,10 +947,9 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1023,10 +1022,9 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1067,10 +1065,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1137,10 +1134,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1228,10 +1224,9 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -10,10 +10,9 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -34,10 +33,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 @@ -71,10 +69,9 @@ ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 -; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v4, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2 @@ -103,10 +100,9 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -321,10 +317,9 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -344,10 +339,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -375,10 +369,9 @@ ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100-NEXT: flat_load_b32 v3, v[0:1] ; GFX1100-NEXT: s_mov_b32 s0, 0 -; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v4, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v3, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -34,9 +34,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 @@ -62,10 +62,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll @@ -13,10 +13,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -47,10 +46,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -88,10 +86,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -127,10 +124,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -176,10 +172,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -214,10 +209,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -258,10 +252,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -299,10 +292,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -343,10 +335,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -375,10 +366,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -413,10 +403,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -450,10 +439,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] @@ -496,10 +484,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -532,10 +519,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -573,10 +559,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -612,10 +597,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] @@ -657,10 +641,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -691,10 +674,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -732,10 +714,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -771,10 +752,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -820,10 +800,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -858,10 +837,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -902,10 +880,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -943,10 +920,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -987,10 +963,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1019,10 +994,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1057,10 +1031,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -1094,10 +1067,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] @@ -1140,10 +1112,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1176,10 +1147,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1217,10 +1187,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -1256,10 +1225,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] @@ -1301,10 +1269,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1335,10 +1302,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1376,10 +1342,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1415,10 +1380,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1464,10 +1428,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1502,10 +1465,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1546,10 +1508,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1587,10 +1548,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1631,10 +1591,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1663,10 +1622,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1701,10 +1659,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1738,10 +1695,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[2:3] @@ -1784,10 +1740,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1820,10 +1775,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -1861,10 +1815,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1900,10 +1853,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] @@ -1945,10 +1897,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -1979,10 +1930,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2020,10 +1970,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2059,10 +2008,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2108,10 +2056,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2146,10 +2093,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2190,10 +2136,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2231,10 +2176,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2275,10 +2219,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2307,10 +2250,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2345,10 +2287,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2382,10 +2323,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[2:3] @@ -2428,10 +2368,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v6, s2 @@ -2464,10 +2403,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v6, s2 @@ -2505,10 +2443,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] @@ -2544,10 +2481,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll @@ -15,11 +15,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -45,11 +44,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -73,11 +71,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -109,10 +106,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -143,10 +139,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -175,10 +170,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -220,11 +214,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -254,11 +247,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -286,11 +278,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -327,10 +318,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -365,10 +355,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -401,10 +390,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -441,11 +429,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -469,11 +456,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -497,11 +483,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -530,10 +515,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -562,10 +546,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -594,10 +577,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -636,11 +618,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -668,11 +649,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -700,11 +680,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -738,10 +717,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -774,10 +752,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -810,10 +787,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -851,11 +827,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -881,11 +856,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -909,11 +883,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -945,10 +918,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -979,10 +951,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1011,10 +982,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1056,11 +1026,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1090,11 +1059,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1122,11 +1090,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1163,10 +1130,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1201,10 +1167,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1237,10 +1202,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -1277,11 +1241,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1305,11 +1268,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1333,11 +1295,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1366,10 +1327,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -1398,10 +1358,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -1430,10 +1389,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1472,11 +1430,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1504,11 +1461,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1536,11 +1492,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1574,10 +1529,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1610,10 +1564,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1646,10 +1599,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -1687,11 +1639,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1717,11 +1668,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1745,11 +1695,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1781,10 +1730,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -1815,10 +1763,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -1847,10 +1794,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -1892,11 +1838,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1926,11 +1871,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1958,11 +1902,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1999,10 +1942,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2037,10 +1979,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2073,10 +2014,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2113,11 +2053,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2141,11 +2080,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2169,11 +2107,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2202,10 +2139,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -2234,10 +2170,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -2266,10 +2201,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -2308,11 +2242,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2340,11 +2273,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2372,11 +2304,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2410,10 +2341,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2446,10 +2376,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2482,10 +2411,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2523,11 +2451,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2553,11 +2480,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2581,11 +2507,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2617,10 +2542,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2651,10 +2575,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2683,10 +2606,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -2728,11 +2650,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2762,11 +2683,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2794,11 +2714,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2835,10 +2754,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -2873,10 +2791,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -2909,10 +2826,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -2949,11 +2865,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2977,11 +2892,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3005,11 +2919,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3038,10 +2951,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 @@ -3070,10 +2982,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 @@ -3102,10 +3013,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 @@ -3144,11 +3054,10 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3176,11 +3085,10 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3208,11 +3116,10 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v0, s4, v1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3246,10 +3153,9 @@ ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v0, v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -3282,10 +3188,9 @@ ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v0, v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -3318,10 +3223,9 @@ ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v0, v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -843,9 +843,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -912,9 +912,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1035,10 +1035,9 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1111,10 +1110,9 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 @@ -1155,10 +1153,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1225,10 +1222,9 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 ; GFX90A-NEXT: buffer_wbl2 @@ -1318,10 +1314,9 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -62,6 +62,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -18,9 +18,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -42,9 +42,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -70,9 +70,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 @@ -106,9 +106,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -130,9 +130,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -158,9 +158,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 @@ -194,9 +194,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -216,9 +216,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -242,10 +242,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -275,9 +274,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -297,9 +296,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -323,10 +322,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -357,9 +355,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -385,9 +383,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -417,9 +415,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -457,9 +455,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -485,9 +483,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] @@ -517,9 +515,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -558,9 +556,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -583,9 +581,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -612,10 +610,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -648,9 +645,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -673,9 +670,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -702,10 +699,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -744,9 +740,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -768,9 +764,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -796,9 +792,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 @@ -832,9 +828,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,9 +852,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -884,9 +880,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 @@ -920,9 +916,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -942,9 +938,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -968,10 +964,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1001,9 +996,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -1023,9 +1018,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1049,10 +1044,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1083,9 +1077,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1111,9 +1105,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1143,9 +1137,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1183,9 +1177,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1211,9 +1205,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] @@ -1243,9 +1237,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1284,9 +1278,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1309,9 +1303,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1338,10 +1332,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1374,9 +1367,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1399,9 +1392,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1428,10 +1421,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -1470,9 +1462,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1494,9 +1486,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1522,9 +1514,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 @@ -1558,9 +1550,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1582,9 +1574,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1610,9 +1602,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 @@ -1646,9 +1638,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -1668,9 +1660,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1694,10 +1686,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1727,9 +1718,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -1749,9 +1740,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1775,10 +1766,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1809,9 +1799,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1837,9 +1827,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1869,9 +1859,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1909,9 +1899,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1937,9 +1927,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] @@ -1969,9 +1959,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2010,9 +2000,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2035,9 +2025,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2064,10 +2054,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2100,9 +2089,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2125,9 +2114,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2154,10 +2143,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2196,9 +2184,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2220,9 +2208,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2248,9 +2236,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 @@ -2284,9 +2272,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2308,9 +2296,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2336,9 +2324,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 @@ -2372,9 +2360,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc @@ -2394,9 +2382,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2420,10 +2408,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2453,9 +2440,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc @@ -2475,9 +2462,9 @@ ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2501,10 +2488,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2535,9 +2521,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2563,9 +2549,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2595,9 +2581,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2635,9 +2621,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2663,9 +2649,9 @@ ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] @@ -2695,9 +2681,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v10, v4 ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2736,9 +2722,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2761,9 +2747,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2790,10 +2776,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2826,9 +2811,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2851,9 +2836,9 @@ ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1] ; GFX10-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc @@ -2880,10 +2865,9 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -694,3 +694,44 @@ S_ENDPGM 0 ... +--- + +# The loop contains a store, a load, and uses values loaded both inside and +# outside the loop. +# We do not expect the waitcnt to be hoisted out of the loop. + +# GFX9-LABEL: waitcnt_vm_loop_flat_load +# GFX9-LABEL: bb.0: +# GFX9-NOT: S_WAITCNT 39 +# GFX9-LABEL: bb.1: +# GFX9: S_WAITCNT 39 +# GFX9-LABEL: bb.2: + +# GFX10-LABEL: waitcnt_vm_loop_flat_load +# GFX10-LABEL: bb.0: +# GFX10-NOT: S_WAITCNT 16 +# GFX10-LABEL: bb.1: +# GFX10: S_WAITCNT 16 +# GFX10-LABEL: bb.2: +name: waitcnt_vm_loop_flat_load +body: | + bb.0: + successors: %bb.1 + + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec + $vgpr7 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec + S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 + +...